aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst7
-rw-r--r--Documentation/dev-tools/kasan.rst63
-rw-r--r--arch/Kconfig9
-rw-r--r--arch/arc/include/asm/pgtable.h1
-rw-r--r--arch/arc/mm/fault.c10
-rw-r--r--arch/arc/mm/highmem.c4
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-4k.h3
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-64k.h3
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c1
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/mm/kasan_init_64.c61
-rw-r--r--drivers/base/memory.c40
-rw-r--r--drivers/hv/hv_balloon.c4
-rw-r--r--drivers/xen/balloon.c1
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/direct-io.c21
-rw-r--r--fs/hugetlbfs/inode.c63
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/userfaultfd.c21
-rw-r--r--include/asm-generic/4level-fixup.h1
-rw-r--r--include/asm-generic/5level-fixup.h1
-rw-r--r--include/asm-generic/pgtable-nop4d.h2
-rw-r--r--include/asm-generic/pgtable-nopmd.h2
-rw-r--r--include/asm-generic/pgtable-nopud.h2
-rw-r--r--include/asm-generic/pgtable.h51
-rw-r--r--include/asm-generic/tlb.h4
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/hugetlb.h140
-rw-r--r--include/linux/kasan.h31
-rw-r--r--include/linux/memblock.h3
-rw-r--r--include/linux/memcontrol.h49
-rw-r--r--include/linux/memory_hotplug.h11
-rw-r--r--include/linux/mm.h34
-rw-r--r--include/linux/mmzone.h34
-rw-r--r--include/linux/moduleloader.h2
-rw-r--r--include/linux/page-isolation.h4
-rw-r--r--include/linux/slab.h20
-rw-r--r--include/linux/string.h2
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/vmalloc.h12
-rw-r--r--include/trace/events/kmem.h47
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/sysctl.c2
-rw-r--r--lib/Kconfig.kasan16
-rw-r--r--lib/test_kasan.c26
-rw-r--r--lib/vsprintf.c40
-rw-r--r--mm/Kconfig40
-rw-r--r--mm/cma.c6
-rw-r--r--mm/cma_debug.c10
-rw-r--r--mm/filemap.c54
-rw-r--r--mm/gup.c40
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/hugetlb.c288
-rw-r--r--mm/hwpoison-inject.c4
-rw-r--r--mm/internal.h27
-rw-r--r--mm/kasan/common.c233
-rw-r--r--mm/kasan/generic_report.c3
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/khugepaged.c18
-rw-r--r--mm/madvise.c14
-rw-r--r--mm/memblock.c111
-rw-r--r--mm/memcontrol.c167
-rw-r--r--mm/memory-failure.c61
-rw-r--r--mm/memory.c52
-rw-r--r--mm/memory_hotplug.c86
-rw-r--r--mm/mempolicy.c47
-rw-r--r--mm/migrate.c16
-rw-r--r--mm/mmap.c63
-rw-r--r--mm/mprotect.c8
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nommu.c10
-rw-r--r--mm/page_alloc.c137
-rw-r--r--mm/page_io.c15
-rw-r--r--mm/page_isolation.c12
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/rmap.c65
-rw-r--r--mm/shmem.c29
-rw-r--r--mm/slab.c7
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slab_common.c99
-rw-r--r--mm/slub.c36
-rw-r--r--mm/sparse.c18
-rw-r--r--mm/swap.c29
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/userfaultfd.c73
-rw-r--r--mm/util.c22
-rw-r--r--mm/vmalloc.c192
-rw-r--r--mm/vmscan.c662
-rw-r--r--mm/workingset.c69
-rw-r--r--mm/z3fold.c375
-rw-r--r--scripts/spelling.txt28
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c36
-rw-r--r--tools/testing/selftests/vm/config1
95 files changed, 2661 insertions, 1506 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 007ba86aef78..6d13f2de6d69 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1288,7 +1288,12 @@ PAGE_SIZE multiple when read back.
inactive_anon, active_anon, inactive_file, active_file, unevictable
Amount of memory, swap-backed and filesystem-backed,
on the internal memory management lists used by the
- page reclaim algorithm
+ page reclaim algorithm.
+
+ As these represent internal list state (e.g. shmem pages are on anon
+ memory management lists), inactive_foo + active_foo may not be equal to
+ the value for the foo counter, since the foo counter is type-based, not
+ list-based.
slab_reclaimable
Part of "slab" that might be reclaimed, such as
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 525296121d89..e4d66e7c50de 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -218,3 +218,66 @@ brk handler is used to print bug reports.
A potential expansion of this mode is a hardware tag-based mode, which would
use hardware memory tagging support instead of compiler instrumentation and
manual shadow memory manipulation.
+
+What memory accesses are sanitised by KASAN?
+--------------------------------------------
+
+The kernel maps memory in a number of different parts of the address
+space. This poses something of a problem for KASAN, which requires
+that all addresses accessed by instrumented code have a valid shadow
+region.
+
+The range of kernel virtual addresses is large: there is not enough
+real memory to support a real shadow region for every address that
+could be accessed by the kernel.
+
+By default
+~~~~~~~~~~
+
+By default, architectures only map real memory over the shadow region
+for the linear mapping (and potentially other small areas). For all
+other areas - such as vmalloc and vmemmap space - a single read-only
+page is mapped over the shadow area. This read-only shadow page
+declares all memory accesses as permitted.
+
+This presents a problem for modules: they do not live in the linear
+mapping, but in a dedicated module space. By hooking in to the module
+allocator, KASAN can temporarily map real shadow memory to cover
+them. This allows detection of invalid accesses to module globals, for
+example.
+
+This also creates an incompatibility with ``VMAP_STACK``: if the stack
+lives in vmalloc space, it will be shadowed by the read-only page, and
+the kernel will fault when trying to set up the shadow data for stack
+variables.
+
+CONFIG_KASAN_VMALLOC
+~~~~~~~~~~~~~~~~~~~~
+
+With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
+cost of greater memory usage. Currently this is only supported on x86.
+
+This works by hooking into vmalloc and vmap, and dynamically
+allocating real shadow memory to back the mappings.
+
+Most mappings in vmalloc space are small, requiring less than a full
+page of shadow space. Allocating a full shadow page per mapping would
+therefore be wasteful. Furthermore, to ensure that different mappings
+use different shadow pages, mappings would have to be aligned to
+``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``.
+
+Instead, we share backing space across multiple mappings. We allocate
+a backing page when a mapping in vmalloc space uses a particular page
+of the shadow region. This page can be shared by other vmalloc
+mappings later on.
+
+We hook in to the vmap infrastructure to lazily clean up unused shadow
+memory.
+
+To avoid the difficulties around swapping mappings around, we expect
+that the part of the shadow region that covers the vmalloc space will
+not be covered by the early shadow page, but will be left
+unmapped. This will require changes in arch-specific code.
+
+This allows ``VMAP_STACK`` support on x86, and can simplify support of
+architectures that do not have a fixed module region.
diff --git a/arch/Kconfig b/arch/Kconfig
index da75bab22eee..7b861fe3f900 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -836,16 +836,17 @@ config HAVE_ARCH_VMAP_STACK
config VMAP_STACK
default y
bool "Use a virtually-mapped stack"
- depends on HAVE_ARCH_VMAP_STACK && !KASAN
+ depends on HAVE_ARCH_VMAP_STACK
+ depends on !KASAN || KASAN_VMALLOC
---help---
Enable this if you want the use virtually-mapped kernel stacks
with guard pages. This causes kernel stack overflows to be
caught immediately rather than causing difficult-to-diagnose
corruption.
- This is presently incompatible with KASAN because KASAN expects
- the stack to map directly to the KASAN shadow map using a formula
- that is incorrect if the stack is in vmalloc space.
+ To use this with KASAN, the architecture must support backing
+ virtual mappings with real shadow memory, and KASAN_VMALLOC must
+ be enabled.
config ARCH_OPTIONAL_KERNEL_RWX
def_bool n
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 7addd0301c51..b917b596f7fb 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -33,7 +33,6 @@
#define _ASM_ARC_PGTABLE_H
#include <linux/bits.h>
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
#include <asm/page.h>
#include <asm/mmu.h> /* to propagate CONFIG_ARC_MMU_VER <n> */
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 3861543b66a0..fb86bc3e9b35 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -30,6 +30,7 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
* with the 'reference' page table.
*/
pgd_t *pgd, *pgd_k;
+ p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
@@ -39,8 +40,13 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
if (!pgd_present(*pgd_k))
goto bad_area;
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
+ p4d = p4d_offset(pgd, address);
+ p4d_k = p4d_offset(pgd_k, address);
+ if (!p4d_present(*p4d_k))
+ goto bad_area;
+
+ pud = pud_offset(p4d, address);
+ pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
goto bad_area;
diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c
index a4856bfaedf3..fc8849e4f72e 100644
--- a/arch/arc/mm/highmem.c
+++ b/arch/arc/mm/highmem.c
@@ -111,12 +111,14 @@ EXPORT_SYMBOL(__kunmap_atomic);
static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr)
{
pgd_t *pgd_k;
+ p4d_t *p4d_k;
pud_t *pud_k;
pmd_t *pmd_k;
pte_t *pte_k;
pgd_k = pgd_offset_k(kvaddr);
- pud_k = pud_offset(pgd_k, kvaddr);
+ p4d_k = p4d_offset(pgd_k, kvaddr);
+ pud_k = pud_offset(p4d_k, kvaddr);
pmd_k = pmd_offset(pud_k, kvaddr);
pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
index a069dfcac9a9..4e697bc2f4cd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -70,9 +70,6 @@ static inline int get_hugepd_cache_index(int index)
/* should not reach */
}
-#else /* !CONFIG_HUGETLB_PAGE */
-static inline int pmd_huge(pmd_t pmd) { return 0; }
-static inline int pud_huge(pud_t pud) { return 0; }
#endif /* CONFIG_HUGETLB_PAGE */
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index e3d4dd4ae2fa..34d1018896b3 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -59,9 +59,6 @@ static inline int get_hugepd_cache_index(int index)
BUG();
}
-#else /* !CONFIG_HUGETLB_PAGE */
-static inline int pmd_huge(pmd_t pmd) { return 0; }
-static inline int pud_huge(pud_t pud) { return 0; }
#endif /* CONFIG_HUGETLB_PAGE */
static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 6ee17d09649c..974109bb85db 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -13,6 +13,7 @@
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
+#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0cb1756223be..5e8949953660 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -134,6 +134,7 @@ config X86
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
+ select HAVE_ARCH_KASAN_VMALLOC if X86_64
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 296da58f3013..cf5bc37c90ac 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -245,6 +245,49 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
} while (pgd++, addr = next, addr != end);
}
+static void __init kasan_shallow_populate_p4ds(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ void *p;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (p4d_none(*p4d)) {
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
+ p4d_populate(&init_mm, p4d, p);
+ }
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_shallow_populate_pgds(void *start, void *end)
+{
+ unsigned long addr, next;
+ pgd_t *pgd;
+ void *p;
+
+ addr = (unsigned long)start;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, (unsigned long)end);
+
+ if (pgd_none(*pgd)) {
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
+ pgd_populate(&init_mm, pgd, p);
+ }
+
+ /*
+ * we need to populate p4ds to be synced when running in
+ * four level mode - see sync_global_pgds_l4()
+ */
+ kasan_shallow_populate_p4ds(pgd, addr, next);
+ } while (pgd++, addr = next, addr != (unsigned long)end);
+}
+
#ifdef CONFIG_KASAN_INLINE
static int kasan_die_handler(struct notifier_block *self,
unsigned long val,
@@ -354,6 +397,24 @@ void __init kasan_init(void)
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)VMALLOC_START));
+
+ /*
+ * If we're in full vmalloc mode, don't back vmalloc space with early
+ * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to
+ * the global table and we can populate the lower levels on demand.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_shallow_populate_pgds(
+ kasan_mem_to_shadow((void *)VMALLOC_START),
+ kasan_mem_to_shadow((void *)VMALLOC_END));
+ else
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)VMALLOC_START),
+ kasan_mem_to_shadow((void *)VMALLOC_END));
+
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)VMALLOC_END + 1),
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 84c4e1f72cbd..799b43191dea 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -19,15 +19,12 @@
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
-#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/uaccess.h>
-static DEFINE_MUTEX(mem_sysfs_mutex);
-
#define MEMORY_CLASS_NAME "memory"
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
@@ -538,12 +535,7 @@ static ssize_t soft_offline_page_store(struct device *dev,
if (kstrtoull(buf, 0, &pfn) < 0)
return -EINVAL;
pfn >>= PAGE_SHIFT;
- if (!pfn_valid(pfn))
- return -ENXIO;
- /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
- if (!pfn_to_online_page(pfn))
- return -EIO;
- ret = soft_offline_page(pfn_to_page(pfn), 0);
+ ret = soft_offline_page(pfn, 0);
return ret == 0 ? count : ret;
}
@@ -705,6 +697,8 @@ static void unregister_memory(struct memory_block *memory)
* Create memory block devices for the given memory area. Start and size
* have to be aligned to memory block granularity. Memory block devices
* will be initialized as offline.
+ *
+ * Called under device_hotplug_lock.
*/
int create_memory_block_devices(unsigned long start, unsigned long size)
{
@@ -718,7 +712,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
!IS_ALIGNED(size, memory_block_size_bytes())))
return -EINVAL;
- mutex_lock(&mem_sysfs_mutex);
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
if (ret)
@@ -730,11 +723,12 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
for (block_id = start_block_id; block_id != end_block_id;
block_id++) {
mem = find_memory_block_by_id(block_id);
+ if (WARN_ON_ONCE(!mem))
+ continue;
mem->section_count = 0;
unregister_memory(mem);
}
}
- mutex_unlock(&mem_sysfs_mutex);
return ret;
}
@@ -742,6 +736,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
* Remove memory block devices for the given memory area. Start and size
* have to be aligned to memory block granularity. Memory block devices
* have to be offline.
+ *
+ * Called under device_hotplug_lock.
*/
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
@@ -754,7 +750,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
!IS_ALIGNED(size, memory_block_size_bytes())))
return;
- mutex_lock(&mem_sysfs_mutex);
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
@@ -763,7 +758,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
unregister_memory_block_under_nodes(mem);
unregister_memory(mem);
}
- mutex_unlock(&mem_sysfs_mutex);
}
/* return true if the memory block is offlined, otherwise, return false */
@@ -797,12 +791,13 @@ static const struct attribute_group *memory_root_attr_groups[] = {
};
/*
- * Initialize the sysfs support for memory devices...
+ * Initialize the sysfs support for memory devices. At the time this function
+ * is called, we cannot have concurrent creation/deletion of memory block
+ * devices, so the device_hotplug_lock is not needed.
*/
void __init memory_dev_init(void)
{
int ret;
- int err;
unsigned long block_sz, nr;
/* Validate the configured memory block size */
@@ -813,24 +808,19 @@ void __init memory_dev_init(void)
ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
if (ret)
- goto out;
+ panic("%s() failed to register subsystem: %d\n", __func__, ret);
/*
* Create entries for memory sections that were found
* during boot and have been initialized
*/
- mutex_lock(&mem_sysfs_mutex);
for (nr = 0; nr <= __highest_present_section_nr;
nr += sections_per_block) {
- err = add_memory_block(nr);
- if (!ret)
- ret = err;
+ ret = add_memory_block(nr);
+ if (ret)
+ panic("%s() failed to add memory block: %d\n", __func__,
+ ret);
}
- mutex_unlock(&mem_sysfs_mutex);
-
-out:
- if (ret)
- panic("%s() failed: %d\n", __func__, ret);
}
/**
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 7f4cf4fc805e..b155d0052981 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -682,9 +682,7 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
__ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
- __online_page_set_limits(pg);
- __online_page_increment_counters(pg);
- __online_page_free(pg);
+ generic_online_page(pg, 0);
lockdep_assert_held(&dm_device.ha_lock);
dm_device.num_pages_onlined++;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 5bae515c8e25..4f2e78a5e4db 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -374,7 +374,6 @@ static void xen_online_page(struct page *page, unsigned int order)
mutex_lock(&balloon_mutex);
for (i = 0; i < size; i++) {
p = pfn_to_page(start_pfn + i);
- __online_page_set_limits(p);
balloon_append(p);
}
mutex_unlock(&balloon_mutex);
diff --git a/fs/buffer.c b/fs/buffer.c
index d39838090b22..d8c7242426bb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,6 +49,8 @@
#include <trace/events/block.h>
#include <linux/fscrypt.h>
+#include "internal.h"
+
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
enum rw_hint hint, struct writeback_control *wbc);
@@ -1423,10 +1425,10 @@ static bool has_bh_in_lru(int cpu, void *dummy)
for (i = 0; i < BH_LRU_SIZE; i++) {
if (b->bhs[i])
- return 1;
+ return true;
}
- return 0;
+ return false;
}
void invalidate_bh_lrus(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9329ced91f1d..0ec4f270139f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -221,27 +221,6 @@ static inline struct page *dio_get_page(struct dio *dio,
}
/*
- * Warn about a page cache invalidation failure during a direct io write.
- */
-void dio_warn_stale_pagecache(struct file *filp)
-{
- static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
- char pathname[128];
- struct inode *inode = file_inode(filp);
- char *path;
-
- errseq_set(&inode->i_mapping->wb_err, -EIO);
- if (__ratelimit(&_rs)) {
- path = file_path(filp, pathname, sizeof(pathname));
- if (IS_ERR(path))
- path = "(unknown)";
- pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
- pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
- current->comm);
- }
-}
-
-/*
* dio_complete() - called when all DIO BIO I/O has been completed
*
* This drops i_dio_count, lets interested parties know that a DIO operation
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a478df035651..d5c2a3158610 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
@@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
+static int do_hugetlbfs_mknod(struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t dev,
+ bool tmpfile)
{
struct inode *inode;
int error = -ENOSPC;
@@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (inode) {
dir->i_ctime = dir->i_mtime = current_time(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+ if (tmpfile) {
+ d_tmpfile(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ dget(dentry);/* Extra count - pin the dentry in core */
+ }
error = 0;
}
return error;
}
+static int hugetlbfs_mknod(struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+}
+
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
@@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
+static int hugetlbfs_tmpfile(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+}
+
static int hugetlbfs_symlink(struct inode *dir,
struct dentry *dentry, const char *symname)
{
@@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = {
.mknod = hugetlbfs_mknod,
.rename = simple_rename,
.setattr = hugetlbfs_setattr,
+ .tmpfile = hugetlbfs_tmpfile,
};
static const struct inode_operations hugetlbfs_inode_operations = {
@@ -1461,28 +1481,41 @@ static int __init init_hugetlbfs_fs(void)
sizeof(struct hugetlbfs_inode_info),
0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
- goto out2;
+ goto out;
error = register_filesystem(&hugetlbfs_fs_type);
if (error)
- goto out;
+ goto out_free;
+ /* default hstate mount is required */
+ mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ if (IS_ERR(mnt)) {
+ error = PTR_ERR(mnt);
+ goto out_unreg;
+ }
+ hugetlbfs_vfsmount[default_hstate_idx] = mnt;
+
+ /* other hstates are optional */
i = 0;
for_each_hstate(h) {
+ if (i == default_hstate_idx)
+ continue;
+
mnt = mount_one_hugetlbfs(h);
- if (IS_ERR(mnt) && i == 0) {
- error = PTR_ERR(mnt);
- goto out;
- }
- hugetlbfs_vfsmount[i] = mnt;
+ if (IS_ERR(mnt))
+ hugetlbfs_vfsmount[i] = NULL;
+ else
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
return 0;
- out:
+ out_unreg:
+ (void)unregister_filesystem(&hugetlbfs_fs_type);
+ out_free:
kmem_cache_destroy(hugetlbfs_inode_cachep);
- out2:
+ out:
return error;
}
fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 3e7da392aa6f..bb981ec76456 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR_OR_ZERO(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (ret)
return ret;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d7f54e535294..37df7c9eedb1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
- new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ new_flags = (vma->vm_flags &
+ ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
@@ -1834,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
features = uffdio_api.features;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
- memset(&uffdio_api, 0, sizeof(uffdio_api));
- if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
- goto out;
- ret = -EINVAL;
- goto out;
- }
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ goto err_out;
+ ret = -EPERM;
+ if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+ goto err_out;
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS;
@@ -1853,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = 0;
out:
return ret;
+err_out:
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ ret = -EFAULT;
+ goto out;
}
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index e3667c9a33a5..c86cf7cb4bba 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -30,7 +30,6 @@
#undef pud_free_tlb
#define pud_free_tlb(tlb, x, addr) do { } while (0)
#define pud_free(mm, x) do { } while (0)
-#define __pud_free_tlb(tlb, x, addr) do { } while (0)
#undef pud_addr_end
#define pud_addr_end(addr, end) (end)
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index f6947da70d71..4c74b1c1d13b 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -51,7 +51,6 @@ static inline int p4d_present(p4d_t p4d)
#undef p4d_free_tlb
#define p4d_free_tlb(tlb, x, addr) do { } while (0)
#define p4d_free(mm, x) do { } while (0)
-#define __p4d_free_tlb(tlb, x, addr) do { } while (0)
#undef p4d_addr_end
#define p4d_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index aebab905e6cd..ce2cbb3c380f 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -50,7 +50,7 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
*/
#define p4d_alloc_one(mm, address) NULL
#define p4d_free(mm, x) do { } while (0)
-#define __p4d_free_tlb(tlb, x, a) do { } while (0)
+#define p4d_free_tlb(tlb, x, a) do { } while (0)
#undef p4d_addr_end
#define p4d_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h
index b85b8271a73d..0d9b28cba16d 100644
--- a/include/asm-generic/pgtable-nopmd.h
+++ b/include/asm-generic/pgtable-nopmd.h
@@ -60,7 +60,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address)
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
}
-#define __pmd_free_tlb(tlb, x, a) do { } while (0)
+#define pmd_free_tlb(tlb, x, a) do { } while (0)
#undef pmd_addr_end
#define pmd_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index c77a1d301155..d3776cb494c0 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -59,7 +59,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
*/
#define pud_alloc_one(mm, address) NULL
#define pud_free(mm, x) do { } while (0)
-#define __pud_free_tlb(tlb, x, a) do { } while (0)
+#define pud_free_tlb(tlb, x, a) do { } while (0)
#undef pud_addr_end
#define pud_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 818691846c90..798ea36a0549 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,8 +558,19 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
* Do the tests inline, but report and clear the bad entry in mm/memory.c.
*/
void pgd_clear_bad(pgd_t *);
+
+#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
+#else
+#define p4d_clear_bad(p4d) do { } while (0)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
+#else
+#define pud_clear_bad(p4d) do { } while (0)
+#endif
+
void pmd_clear_bad(pmd_t *);
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
@@ -903,6 +914,21 @@ static inline int pud_write(pud_t pud)
}
#endif /* pud_write */
+#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static inline int pmd_devmap(pmd_t pmd)
+{
+ return 0;
+}
+static inline int pud_devmap(pud_t pud)
+{
+ return 0;
+}
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
+#endif
+
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
!defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
@@ -912,6 +938,31 @@ static inline int pud_trans_huge(pud_t pud)
}
#endif
+/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
+static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud)
+{
+ pud_t pudval = READ_ONCE(*pud);
+
+ if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
+ return 1;
+ if (unlikely(pud_bad(pudval))) {
+ pud_clear_bad(pud);
+ return 1;
+ }
+ return 0;
+}
+
+/* See pmd_trans_unstable for discussion. */
+static inline int pud_trans_unstable(pud_t *pud)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ return pud_none_or_trans_huge_or_dev_or_clear_bad(pud);
+#else
+ return 0;
+#endif
+}
+
#ifndef pmd_read_atomic
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e64991142a8b..2b10036fefd0 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -584,7 +584,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
} while (0)
#endif
-#ifndef __ARCH_HAS_4LEVEL_HACK
#ifndef pud_free_tlb
#define pud_free_tlb(tlb, pudp, address) \
do { \
@@ -594,9 +593,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
__pud_free_tlb(tlb, pudp, address); \
} while (0)
#endif
-#endif
-#ifndef __ARCH_HAS_5LEVEL_HACK
#ifndef p4d_free_tlb
#define p4d_free_tlb(tlb, pudp, address) \
do { \
@@ -605,7 +602,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
#endif
-#endif
#endif /* CONFIG_MMU */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 670f96a6529c..c159a8bdee8b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3156,7 +3156,6 @@ enum {
};
void dio_end_io(struct bio *bio);
-void dio_warn_stale_pagecache(struct file *filp);
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
@@ -3201,6 +3200,11 @@ static inline void inode_dio_end(struct inode *inode)
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
+/*
+ * Warn about a page cache invalidation failure during a direct I/O write.
+ */
+void dio_warn_stale_pagecache(struct file *filp);
+
extern void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 61f2f6ff9467..e5b817cb86e7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -612,6 +612,8 @@ static inline bool pm_suspended_storage(void)
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
+extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
#endif
void free_contig_range(unsigned long pfn, unsigned int nr_pages);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 53fc34f930d0..31d4920994b9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address);
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
@@ -164,38 +163,130 @@ static inline void adjust_range_if_pmd_sharing_possible(
{
}
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
-#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
-#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
+static inline long follow_hugetlb_page(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct page **pages,
+ struct vm_area_struct **vmas, unsigned long *position,
+ unsigned long *nr_pages, long i, unsigned int flags,
+ int *nonblocking)
+{
+ BUG();
+ return 0;
+}
+
+static inline struct page *follow_huge_addr(struct mm_struct *mm,
+ unsigned long address, int write)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static inline int copy_hugetlb_page_range(struct mm_struct *dst,
+ struct mm_struct *src, struct vm_area_struct *vma)
+{
+ BUG();
+ return 0;
+}
+
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}
-#define hugetlb_report_node_meminfo(n, buf) 0
+
+static inline int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+ return 0;
+}
+
static inline void hugetlb_show_meminfo(void)
{
}
-#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL
-#define follow_huge_pmd(mm, addr, pmd, flags) NULL
-#define follow_huge_pud(mm, addr, pud, flags) NULL
-#define follow_huge_pgd(mm, addr, pgd, flags) NULL
-#define prepare_hugepage_range(file, addr, len) (-EINVAL)
-#define pmd_huge(x) 0
-#define pud_huge(x) 0
-#define is_hugepage_only_range(mm, addr, len) 0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
- src_addr, pagep) ({ BUG(); 0; })
-#define huge_pte_offset(mm, address, sz) 0
+
+static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
+ unsigned long address, hugepd_t hpd, int flags,
+ int pdshift)
+{
+ return NULL;
+}
+
+static inline struct page *follow_huge_pmd(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmd, int flags)
+{
+ return NULL;
+}
+
+static inline struct page *follow_huge_pud(struct mm_struct *mm,
+ unsigned long address, pud_t *pud, int flags)
+{
+ return NULL;
+}
+
+static inline struct page *follow_huge_pgd(struct mm_struct *mm,
+ unsigned long address, pgd_t *pgd, int flags)
+{
+ return NULL;
+}
+
+static inline int prepare_hugepage_range(struct file *file,
+ unsigned long addr, unsigned long len)
+{
+ return -EINVAL;
+}
+
+static inline int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+ return 0;
+}
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long len)
+{
+ return 0;
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ BUG();
+}
+
+static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ BUG();
+ return 0;
+}
+
+static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
+ unsigned long sz)
+{
+ return NULL;
+}
static inline bool isolate_huge_page(struct page *page, struct list_head *list)
{
return false;
}
-#define putback_active_hugepage(p) do {} while (0)
-#define move_hugetlb_state(old, new, reason) do {} while (0)
-static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
- unsigned long address, unsigned long end, pgprot_t newprot)
+static inline void putback_active_hugepage(struct page *page)
+{
+}
+
+static inline void move_hugetlb_state(struct page *oldpage,
+ struct page *newpage, int reason)
+{
+}
+
+static inline unsigned long hugetlb_change_protection(
+ struct vm_area_struct *vma, unsigned long address,
+ unsigned long end, pgprot_t newprot)
{
return 0;
}
@@ -213,9 +304,10 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
{
BUG();
}
+
static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
{
BUG();
return 0;
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index cc8a03cc9674..4f404c565db1 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -70,8 +70,18 @@ struct kasan_cache {
int free_meta_offset;
};
+/*
+ * These functions provide a special case to support backing module
+ * allocations with real shadow memory. With KASAN vmalloc, the special
+ * case is unnecessary, as the work is handled in the generic case.
+ */
+#ifndef CONFIG_KASAN_VMALLOC
int kasan_module_alloc(void *addr, size_t size);
void kasan_free_shadow(const struct vm_struct *vm);
+#else
+static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+#endif
int kasan_add_zero_shadow(void *start, unsigned long size);
void kasan_remove_zero_shadow(void *start, unsigned long size);
@@ -194,4 +204,25 @@ static inline void *kasan_reset_tag(const void *addr)
#endif /* CONFIG_KASAN_SW_TAGS */
+#ifdef CONFIG_KASAN_VMALLOC
+int kasan_populate_vmalloc(unsigned long requested_size,
+ struct vm_struct *area);
+void kasan_poison_vmalloc(void *start, unsigned long size);
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end);
+#else
+static inline int kasan_populate_vmalloc(unsigned long requested_size,
+ struct vm_struct *area)
+{
+ return 0;
+}
+
+static inline void kasan_poison_vmalloc(void *start, unsigned long size) {}
+static inline void kasan_release_vmalloc(unsigned long start,
+ unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end) {}
+#endif
+
#endif /* LINUX_KASAN_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f491690d54c6..b38bbefabfab 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
MEMBLOCK_ALLOC_ACCESSIBLE);
}
+void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid);
void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ae703ea3ef48..a7a0a1a5c8d5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -58,7 +58,6 @@ enum mem_cgroup_protection {
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
- int priority;
unsigned int generation;
};
@@ -81,7 +80,6 @@ struct mem_cgroup_id {
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
@@ -112,7 +110,7 @@ struct memcg_shrinker_map {
};
/*
- * per-zone information in memory controller.
+ * per-node information in memory controller.
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;
@@ -126,7 +124,7 @@ struct mem_cgroup_per_node {
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
- struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+ struct mem_cgroup_reclaim_iter iter;
struct memcg_shrinker_map __rcu *shrinker_map;
@@ -134,9 +132,6 @@ struct mem_cgroup_per_node {
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
- bool congested; /* memcg has many dirty pages */
- /* backed by a congested BDI */
-
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
@@ -313,13 +308,6 @@ struct mem_cgroup {
struct list_head kmem_caches;
#endif
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
struct wb_domain cgwb_domain;
@@ -394,25 +382,27 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
}
/**
- * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
- * @node: node of the wanted lruvec
+ * mem_cgroup_lruvec - get the lru list vector for a memcg & node
* @memcg: memcg of the wanted lruvec
*
- * Returns the lru list vector holding pages for a given @node or a given
- * @memcg and @zone. This can be the node lruvec, if the memory controller
- * is disabled.
+ * Returns the lru list vector holding pages for a given @memcg &
+ * @node combination. This can be the node lruvec, if the memory
+ * controller is disabled.
*/
-static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
- struct mem_cgroup *memcg)
+static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
{
struct mem_cgroup_per_node *mz;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
- lruvec = node_lruvec(pgdat);
+ lruvec = &pgdat->__lruvec;
goto out;
}
+ if (!memcg)
+ memcg = root_mem_cgroup;
+
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
lruvec = &mz->lruvec;
out:
@@ -728,7 +718,7 @@ static inline void __mod_lruvec_page_state(struct page *page,
return;
}
- lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup);
+ lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
@@ -899,16 +889,21 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
{
}
-static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
- struct mem_cgroup *memcg)
+static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
{
- return node_lruvec(pgdat);
+ return &pgdat->__lruvec;
}
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
struct pglist_data *pgdat)
{
- return &pgdat->lruvec;
+ return &pgdat->__lruvec;
+}
+
+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+ return NULL;
}
static inline bool mm_match_cgroup(struct mm_struct *mm,
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f46ea71b4ffd..3a08ecdfca11 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -102,13 +102,10 @@ extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
+extern void generic_online_page(struct page *page, unsigned int order);
extern int set_online_page_callback(online_page_callback_t callback);
extern int restore_online_page_callback(online_page_callback_t callback);
-extern void __online_page_set_limits(struct page *page);
-extern void __online_page_increment_counters(struct page *page);
-extern void __online_page_free(struct page *page);
-
extern int try_online_node(int nid);
extern int arch_add_memory(int nid, u64 start, u64 size,
@@ -229,9 +226,6 @@ void put_online_mems(void);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
#else /* ! CONFIG_MEMORY_HOTPLUG */
#define pfn_to_online_page(pfn) \
({ \
@@ -339,6 +333,9 @@ static inline int remove_memory(int nid, u64 start, u64 size)
static inline void __remove_memory(int nid, u64 start, u64 size) {}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
extern void __ref free_area_init_core_hotplug(int nid);
extern int __add_memory(int nid, u64 start, u64 size);
extern int add_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6fb714fa851..8b0ef04b6d15 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -564,21 +564,6 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
struct mmu_gather;
struct inode;
-#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline int pmd_devmap(pmd_t pmd)
-{
- return 0;
-}
-static inline int pud_devmap(pud_t pud)
-{
- return 0;
-}
-static inline int pgd_devmap(pgd_t pgd)
-{
- return 0;
-}
-#endif
-
/*
* FIXME: take this include out, include page-flags.h in
* files which need it (119 of them)
@@ -1643,19 +1628,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
return (unsigned long)val;
}
+void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
- atomic_long_add(value, &mm->rss_stat.count[member]);
+ long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+ mm_trace_rss_stat(mm, member, count);
}
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
- atomic_long_inc(&mm->rss_stat.count[member]);
+ long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+ mm_trace_rss_stat(mm, member, count);
}
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
- atomic_long_dec(&mm->rss_stat.count[member]);
+ long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+ mm_trace_rss_stat(mm, member, count);
}
/* Optimized variant when page is already known not to be PageAnon */
@@ -2214,9 +2207,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
extern void setup_per_cpu_pageset(void);
-extern void zone_pcp_update(struct zone *zone);
-extern void zone_pcp_reset(struct zone *zone);
-
/* page_alloc.c */
extern int min_free_kbytes;
extern int watermark_boost_factor;
@@ -2780,7 +2770,7 @@ extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages __read_mostly;
-extern int soft_offline_page(struct page *page, int flags);
+extern int soft_offline_page(unsigned long pfn, int flags);
/*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b0a36d1580b6..89d8ff06c9ce 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -273,12 +273,12 @@ enum lru_list {
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
-static inline int is_file_lru(enum lru_list lru)
+static inline bool is_file_lru(enum lru_list lru)
{
return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
}
-static inline int is_active_lru(enum lru_list lru)
+static inline bool is_active_lru(enum lru_list lru)
{
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
@@ -296,6 +296,12 @@ struct zone_reclaim_stat {
unsigned long recent_scanned[2];
};
+enum lruvec_flags {
+ LRUVEC_CONGESTED, /* lruvec has many dirty pages
+ * backed by a congested BDI
+ */
+};
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
@@ -303,12 +309,14 @@ struct lruvec {
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long refaults;
+ /* Various lruvec state flags (enum lruvec_flags) */
+ unsigned long flags;
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
};
-/* Isolate unmapped file */
+/* Isolate unmapped pages */
#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
@@ -572,9 +580,6 @@ struct zone {
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
- PGDAT_CONGESTED, /* pgdat has many dirty pages backed by
- * a congested BDI
- */
PGDAT_DIRTY, /* reclaim scanning has recently found
* many dirty file pages at the tail
* of the LRU.
@@ -777,7 +782,13 @@ typedef struct pglist_data {
#endif
/* Fields commonly accessed by the page reclaim scanner */
- struct lruvec lruvec;
+
+ /*
+ * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
+ *
+ * Use mem_cgroup_lruvec() to look up lruvecs.
+ */
+ struct lruvec __lruvec;
unsigned long flags;
@@ -800,11 +811,6 @@ typedef struct pglist_data {
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
-static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
-{
- return &pgdat->lruvec;
-}
-
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
return pgdat->node_start_pfn + pgdat->node_spanned_pages;
@@ -842,7 +848,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
#ifdef CONFIG_MEMCG
return lruvec->pgdat;
#else
- return container_of(lruvec, struct pglist_data, lruvec);
+ return container_of(lruvec, struct pglist_data, __lruvec);
#endif
}
@@ -1079,7 +1085,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
/**
* for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
* @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->zones being iterated
+ * @z - The current pointer within zonelist->_zonerefs being iterated
* @zlist - The zonelist being iterated
* @highidx - The zone index of the highest zone to return
* @nodemask - Nodemask allowed by the allocator
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 5229c18025e9..ca92aea8a6bd 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -91,7 +91,7 @@ void module_arch_cleanup(struct module *mod);
/* Any cleanup before freeing mod->module_init */
void module_arch_freeing_init(struct module *mod);
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC)
#include <linux/kasan.h>
#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
#else
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 1099c2fee20f..6861df759fad 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -30,7 +30,7 @@ static inline bool is_migrate_isolate(int migratetype)
}
#endif
-#define SKIP_HWPOISON 0x1
+#define MEMORY_OFFLINE 0x1
#define REPORT_FAILURE 0x2
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
@@ -58,7 +58,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Test all pages in [start_pfn, end_pfn) are isolated or not.
*/
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- bool skip_hwpoisoned_pages);
+ int isol_flags);
struct page *alloc_migrate_target(struct page *page, unsigned long private);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 4d2a2fa55ed5..877a95c6a2d2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -561,26 +561,6 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
return __kmalloc(size, flags);
}
-/*
- * Determine size used for the nth kmalloc cache.
- * return size or 0 if a kmalloc cache for that
- * size does not exist
- */
-static __always_inline unsigned int kmalloc_size(unsigned int n)
-{
-#ifndef CONFIG_SLOB
- if (n > 2)
- return 1U << n;
-
- if (n == 1 && KMALLOC_MIN_SIZE <= 32)
- return 96;
-
- if (n == 2 && KMALLOC_MIN_SIZE <= 64)
- return 192;
-#endif
- return 0;
-}
-
static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
diff --git a/include/linux/string.h b/include/linux/string.h
index b6ccdc2c7f02..02894e417565 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
const void *from, size_t available);
+int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);
+
/**
* strstarts - does @str start with @prefix?
* @str: string to examine
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 063c0c1e112b..1e99f7ac1d7e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,7 +307,7 @@ struct vma_swap_readahead {
};
/* linux/mm/workingset.c */
-void *workingset_eviction(struct page *page);
+void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index b4c58a191eb1..a4b241102771 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -22,6 +22,18 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
+
+/*
+ * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
+ *
+ * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
+ * shadow memory has been mapped. It's used to handle allocation errors so that
+ * we don't try to poison shadow on free if it was never allocated.
+ *
+ * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
+ * determine which allocations need the module shadow freed.
+ */
+
/*
* Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
* vfree_atomic().
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 69e8bb8963db..ad7e642bd497 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -316,6 +316,53 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->change_ownership)
);
+/*
+ * Required for uniquely and securely identifying mm in rss_stat tracepoint.
+ */
+#ifndef __PTR_TO_HASHVAL
+static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
+{
+ int ret;
+ unsigned long hashval;
+
+ ret = ptr_to_hashval(ptr, &hashval);
+ if (ret)
+ return 0;
+
+ /* The hashed value is only 32-bit */
+ return (unsigned int)hashval;
+}
+#define __PTR_TO_HASHVAL
+#endif
+
+TRACE_EVENT(rss_stat,
+
+ TP_PROTO(struct mm_struct *mm,
+ int member,
+ long count),
+
+ TP_ARGS(mm, member, count),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, mm_id)
+ __field(unsigned int, curr)
+ __field(int, member)
+ __field(long, size)
+ ),
+
+ TP_fast_assign(
+ __entry->mm_id = mm_ptr_to_hash(mm);
+ __entry->curr = !!(current->mm == mm);
+ __entry->member = member;
+ __entry->size = (count << PAGE_SHIFT);
+ ),
+
+ TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
+ __entry->mm_id,
+ __entry->curr,
+ __entry->member,
+ __entry->size)
+ );
#endif /* _TRACE_KMEM_H */
/* This part must be outside protection */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c74761004ee5..ece7e13f6e4a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
+ if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 0f0bac8318dd..21c6c1e29b98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,6 +93,7 @@
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
+#include <linux/kasan.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
+ /* Clear the KASAN shadow of the stack. */
+ kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6f2f35d0bcf..70665934d53e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = SYSCTL_ONE,
.extra2 = &four,
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 6c9682ce0254..81f5464ea9e1 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -6,6 +6,9 @@ config HAVE_ARCH_KASAN
config HAVE_ARCH_KASAN_SW_TAGS
bool
+config HAVE_ARCH_KASAN_VMALLOC
+ bool
+
config CC_HAS_KASAN_GENERIC
def_bool $(cc-option, -fsanitize=kernel-address)
@@ -142,6 +145,19 @@ config KASAN_SW_TAGS_IDENTIFY
(use-after-free or out-of-bounds) at the cost of increased
memory consumption.
+config KASAN_VMALLOC
+ bool "Back mappings in vmalloc space with real shadow memory"
+ depends on KASAN && HAVE_ARCH_KASAN_VMALLOC
+ help
+ By default, the shadow region for vmalloc space is the read-only
+ zero page. This means that KASAN cannot detect errors involving
+ vmalloc space.
+
+ Enabling this option will hook in to vmap/vmalloc and back those
+ mappings with real shadow memory allocated on demand. This allows
+ for KASAN to detect more sorts of errors (and to support vmapped
+ stacks), but at the cost of higher memory usage.
+
config TEST_KASAN
tristate "Module for testing KASAN for bug detection"
depends on m && KASAN
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 49cc4d570a40..328d33beae36 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -19,6 +19,7 @@
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
@@ -748,6 +749,30 @@ static noinline void __init kmalloc_double_kzfree(void)
kzfree(ptr);
}
+#ifdef CONFIG_KASAN_VMALLOC
+static noinline void __init vmalloc_oob(void)
+{
+ void *area;
+
+ pr_info("vmalloc out-of-bounds\n");
+
+ /*
+ * We have to be careful not to hit the guard page.
+ * The MMU will catch that and crash us.
+ */
+ area = vmalloc(3000);
+ if (!area) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ ((volatile char *)area)[3100];
+ vfree(area);
+}
+#else
+static void __init vmalloc_oob(void) {}
+#endif
+
static int __init kmalloc_tests_init(void)
{
/*
@@ -793,6 +818,7 @@ static int __init kmalloc_tests_init(void)
kasan_strings();
kasan_bitops();
kmalloc_double_kzfree();
+ vmalloc_oob();
kasan_restore_multi_shot(multishot);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index dee8fc467fcf..7c488a1ce318 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -761,11 +761,38 @@ static int __init initialize_ptr_random(void)
early_initcall(initialize_ptr_random);
/* Maps a pointer to a 32 bit unique identifier. */
+static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
+{
+ unsigned long hashval;
+
+ if (static_branch_unlikely(&not_filled_random_ptr_key))
+ return -EAGAIN;
+
+#ifdef CONFIG_64BIT
+ hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
+ /*
+ * Mask off the first 32 bits, this makes explicit that we have
+ * modified the address (and 32 bits is plenty for a unique ID).
+ */
+ hashval = hashval & 0xffffffff;
+#else
+ hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
+#endif
+ *hashval_out = hashval;
+ return 0;
+}
+
+int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
+{
+ return __ptr_to_hashval(ptr, hashval_out);
+}
+
static char *ptr_to_id(char *buf, char *end, const void *ptr,
struct printf_spec spec)
{
const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";
unsigned long hashval;
+ int ret;
/* When debugging early boot use non-cryptographically secure hash. */
if (unlikely(debug_boot_weak_hash)) {
@@ -773,22 +800,13 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr,
return pointer_string(buf, end, (const void *)hashval, spec);
}
- if (static_branch_unlikely(&not_filled_random_ptr_key)) {
+ ret = __ptr_to_hashval(ptr, &hashval);
+ if (ret) {
spec.field_width = 2 * sizeof(ptr);
/* string length must be less than default_width */
return error_string(buf, end, str, spec);
}
-#ifdef CONFIG_64BIT
- hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
- /*
- * Mask off the first 32 bits, this makes explicit that we have
- * modified the address (and 32 bits is plenty for a unique ID).
- */
- hashval = hashval & 0xffffffff;
-#else
- hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
-#endif
return pointer_string(buf, end, (const void *)hashval, spec);
}
diff --git a/mm/Kconfig b/mm/Kconfig
index f332efe751dd..ab80933be65f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -29,7 +29,7 @@ config FLATMEM_MANUAL
For systems that have holes in their physical address
spaces and for features like NUMA and memory hotplug,
- choose "Sparse Memory"
+ choose "Sparse Memory".
If unsure, choose this option (Flat Memory) over any other.
@@ -122,9 +122,9 @@ config SPARSEMEM_VMEMMAP
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
default y
help
- SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
- pfn_to_page and page_to_pfn operations. This is the most
- efficient option when sufficient kernel resources are available.
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
config HAVE_MEMBLOCK_NODE_MAP
bool
@@ -160,9 +160,9 @@ config MEMORY_HOTPLUG_SPARSE
depends on SPARSEMEM && MEMORY_HOTPLUG
config MEMORY_HOTPLUG_DEFAULT_ONLINE
- bool "Online the newly added memory blocks by default"
- depends on MEMORY_HOTPLUG
- help
+ bool "Online the newly added memory blocks by default"
+ depends on MEMORY_HOTPLUG
+ help
This option sets the default policy setting for memory hotplug
onlining policy (/sys/devices/system/memory/auto_online_blocks) which
determines what happens to newly added memory regions. Policy setting
@@ -227,14 +227,14 @@ config COMPACTION
select MIGRATION
depends on MMU
help
- Compaction is the only memory management component to form
- high order (larger physically contiguous) memory blocks
- reliably. The page allocator relies on compaction heavily and
- the lack of the feature can lead to unexpected OOM killer
- invocations for high order memory requests. You shouldn't
- disable this option unless there really is a strong reason for
- it and then we would be really interested to hear about that at
- linux-mm@kvack.org.
+ Compaction is the only memory management component to form
+ high order (larger physically contiguous) memory blocks
+ reliably. The page allocator relies on compaction heavily and
+ the lack of the feature can lead to unexpected OOM killer
+ invocations for high order memory requests. You shouldn't
+ disable this option unless there really is a strong reason for
+ it and then we would be really interested to hear about that at
+ linux-mm@kvack.org.
#
# support for page migration
@@ -258,7 +258,7 @@ config ARCH_ENABLE_THP_MIGRATION
bool
config CONTIG_ALLOC
- def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+ def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
config PHYS_ADDR_T_64BIT
def_bool 64BIT
@@ -302,10 +302,10 @@ config KSM
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
config DEFAULT_MMAP_MIN_ADDR
- int "Low address space to protect from user allocation"
+ int "Low address space to protect from user allocation"
depends on MMU
- default 4096
- help
+ default 4096
+ help
This is the portion of low virtual memory which should be protected
from userspace allocation. Keeping a user from writing to low pages
can help reduce the impact of kernel NULL pointer bugs.
@@ -408,7 +408,7 @@ choice
endchoice
config ARCH_WANTS_THP_SWAP
- def_bool n
+ def_bool n
config THP_SWAP
def_bool y
diff --git a/mm/cma.c b/mm/cma.c
index 7fe0b8356775..be55d1988c67 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -95,13 +95,11 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
static int __init cma_activate_area(struct cma *cma)
{
- int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
- cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-
+ cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
if (!cma->bitmap) {
cma->count = 0;
return -ENOMEM;
@@ -139,7 +137,7 @@ static int __init cma_activate_area(struct cma *cma)
not_in_zone:
pr_err("CMA area %s could not be activated\n", cma->name);
- kfree(cma->bitmap);
+ bitmap_free(cma->bitmap);
cma->count = 0;
return -EINVAL;
}
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index a7dd9e8e10d5..4e6cbe2f586e 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -29,7 +29,7 @@ static int cma_debugfs_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
static int cma_used_get(void *data, u64 *val)
{
@@ -44,7 +44,7 @@ static int cma_used_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
static int cma_maxchunk_get(void *data, u64 *val)
{
@@ -66,7 +66,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
{
@@ -126,7 +126,7 @@ static int cma_free_write(void *data, u64 val)
return cma_free_mem(cma, pages);
}
-DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
static int cma_alloc_mem(struct cma *cma, int count)
{
@@ -158,7 +158,7 @@ static int cma_alloc_write(void *data, u64 val)
return cma_alloc_mem(cma, pages);
}
-DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 85b7d087eb45..bf6aa30be58d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
- struct file *fpin)
-{
- int flags = vmf->flags;
-
- if (fpin)
- return fpin;
-
- /*
- * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
- * anything, so we only pin the file and drop the mmap_sem if only
- * FAULT_FLAG_ALLOW_RETRY is set.
- */
- if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
- FAULT_FLAG_ALLOW_RETRY) {
- fpin = get_file(vmf->vma->vm_file);
- up_read(&vmf->vma->vm_mm->mmap_sem);
- }
- return fpin;
-}
-
/*
* lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
* @vmf - the vm_fault for this fault.
@@ -3161,6 +3140,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(pagecache_write_end);
+/*
+ * Warn about a page cache invalidation failure during a direct I/O write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+ static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
+ char pathname[128];
+ struct inode *inode = file_inode(filp);
+ char *path;
+
+ errseq_set(&inode->i_mapping->wb_err, -EIO);
+ if (__ratelimit(&_rs)) {
+ path = file_path(filp, pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
+ pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+ current->comm);
+ }
+}
+
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3218,11 +3218,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* Most of the time we do not need this since dio_complete() will do
* the invalidation for us. However there are some file systems that
* do not end up with dio_complete() being called, so let's not break
- * them by removing it completely
+ * them by removing it completely.
+ *
+ * A notable example is blkdev_direct_IO().
+ *
+ * Skip invalidation for async writes or if mapping has no pages.
*/
- if (mapping->nrpages)
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
+ if (written > 0 && mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
+ dio_warn_stale_pagecache(file);
if (written > 0) {
pos += written;
diff --git a/mm/gup.c b/mm/gup.c
index 8f236a335ae9..7646bf993b25 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -734,11 +734,17 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
+ * Returns either the number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
*
* Must be called with mmap_sem held. It may be released. See below.
*
@@ -1107,11 +1113,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
+ * Returns either the number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
@@ -1443,6 +1455,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
bool drain_allow = true;
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
+ long ret = nr_pages;
check_again:
for (i = 0; i < nr_pages;) {
@@ -1504,17 +1517,18 @@ check_again:
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
- if ((nr_pages > 0) && migrate_allow) {
+ if ((ret > 0) && migrate_allow) {
+ nr_pages = ret;
drain_allow = true;
goto check_again;
}
}
- return nr_pages;
+ return ret;
}
#else
static long check_and_migrate_cma_pages(struct task_struct *tsk,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 13cc93785006..41a0fbddc96b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3003,7 +3003,7 @@ next:
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
"%llu\n");
static int __init split_huge_pages_debugfs(void)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b45a95363a84..ac65bb5e38ac 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -244,16 +244,66 @@ struct file_region {
long to;
};
+/* Must be called with resv->lock held. Calling this with count_only == true
+ * will count the number of pages to be added but will not modify the linked
+ * list.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+ bool count_only)
+{
+ long chg = 0;
+ struct list_head *head = &resv->regions;
+ struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg;
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* We overlap with this area, if it extends further than
+ * us then we must extend ourselves. Account for its
+ * existing reservation.
+ */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+
+ if (!count_only && rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+
+ if (!count_only) {
+ nrg->from = f;
+ nrg->to = t;
+ }
+
+ return chg;
+}
+
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
+ * map. Existing regions will be expanded to accommodate the specified
+ * range, or a region will be taken from the cache. Sufficient regions
+ * must exist in the cache due to the previous call to region_chg with
+ * the same range.
*
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
@@ -261,7 +311,7 @@ struct file_region {
static long region_add(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg, *trg;
+ struct file_region *rg, *nrg;
long add = 0;
spin_lock(&resv->lock);
@@ -272,9 +322,8 @@ static long region_add(struct resv_map *resv, long f, long t)
/*
* If no region exists which can be expanded to include the
- * specified range, the list must have been modified by an
- * interleving call to region_del(). Pull a region descriptor
- * from the cache and use it for this range.
+ * specified range, pull a region descriptor from the cache
+ * and use it for this range.
*/
if (&rg->link == head || t < rg->from) {
VM_BUG_ON(resv->region_cache_count <= 0);
@@ -292,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t)
goto out_locked;
}
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
-
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- break;
-
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- /* Decrement return value by the deleted range.
- * Another range will span this area so that by
- * end of routine add will be >= zero
- */
- add -= (rg->to - rg->from);
- list_del(&rg->link);
- kfree(rg);
- }
- }
-
- add += (nrg->from - f); /* Added to beginning of region */
- nrg->from = f;
- add += t - nrg->to; /* Added to end of region */
- nrg->to = t;
+ add = add_reservation_in_range(resv, f, t, false);
out_locked:
resv->adds_in_progress--;
@@ -339,15 +357,9 @@ out_locked:
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
- * map. However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder. This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
- *
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map. If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+ * map. A new file_region structure is added to the cache
+ * as a placeholder, so that the subsequent region_add
+ * call will have all the regions it needs and will not fail.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater or equal to
@@ -356,11 +368,8 @@ out_locked:
*/
static long region_chg(struct resv_map *resv, long f, long t)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg = NULL;
long chg = 0;
-retry:
spin_lock(&resv->lock);
retry_locked:
resv->adds_in_progress++;
@@ -378,10 +387,8 @@ retry_locked:
spin_unlock(&resv->lock);
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
- if (!trg) {
- kfree(nrg);
+ if (!trg)
return -ENOMEM;
- }
spin_lock(&resv->lock);
list_add(&trg->link, &resv->region_cache);
@@ -389,61 +396,8 @@ retry_locked:
goto retry_locked;
}
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+ chg = add_reservation_in_range(resv, f, t, true);
- /* If we are below the current region then a new region is required.
- * Subtle, allocate a new region at the position but make it zero
- * size such that we can guarantee to record the reservation. */
- if (&rg->link == head || t < rg->from) {
- if (!nrg) {
- resv->adds_in_progress--;
- spin_unlock(&resv->lock);
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
- if (!nrg)
- return -ENOMEM;
-
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- goto retry;
- }
-
- list_add(&nrg->link, rg->link.prev);
- chg = t - f;
- goto out_nrg;
- }
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
- chg = t - f;
-
- /* Check for and consume any regions we now overlap with. */
- list_for_each_entry(rg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- goto out;
-
- /* We overlap with this area, if it extends further than
- * us then we must extend ourselves. Account for its
- * existing reservation. */
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
- }
- chg -= rg->to - rg->from;
- }
-
-out:
- spin_unlock(&resv->lock);
- /* We already know we raced and no longer need the new region */
- kfree(nrg);
- return chg;
-out_nrg:
spin_unlock(&resv->lock);
return chg;
}
@@ -1069,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
}
#ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages, gfp_t gfp_mask)
-{
- unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long i, end_pfn = start_pfn + nr_pages;
- struct page *page;
-
- for (i = start_pfn; i < end_pfn; i++) {
- page = pfn_to_online_page(i);
- if (!page)
- return false;
-
- if (page_zone(page) != z)
- return false;
-
- if (PageReserved(page))
- return false;
-
- if (page_count(page) > 0)
- return false;
-
- if (PageHuge(page))
- return false;
- }
-
- return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long last_pfn = start_pfn + nr_pages - 1;
- return zone_spans_pfn(zone, last_pfn);
-}
-
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- unsigned int order = huge_page_order(h);
- unsigned long nr_pages = 1 << order;
- unsigned long ret, pfn, flags;
- struct zonelist *zonelist;
- struct zone *zone;
- struct zoneref *z;
+ unsigned long nr_pages = 1UL << huge_page_order(h);
- zonelist = node_zonelist(nid, gfp_mask);
- for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
- spin_lock_irqsave(&zone->lock, flags);
-
- pfn = ALIGN(zone->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
- /*
- * We release the zone lock here because
- * alloc_contig_range() will also lock the zone
- * at some point. If there's an allocation
- * spinning on this lock, it may win the race
- * and cause alloc_contig_range() to fail...
- */
- spin_unlock_irqrestore(&zone->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
- if (!ret)
- return pfn_to_page(pfn);
- spin_lock_irqsave(&zone->lock, flags);
- }
- pfn += nr_pages;
- }
-
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-
- return NULL;
+ return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
@@ -3915,7 +3796,7 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4042,8 +3923,7 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
unsigned long key[2];
u32 hash;
@@ -4051,7 +3931,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
key[0] = (unsigned long) mapping;
key[1] = idx;
- hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+ hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
return hash & (num_fault_mutexes - 1);
}
@@ -4060,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
return 0;
}
@@ -4105,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4459,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
}
+
+ /*
+ * If subpage information is not requested, update counters
+ * and skip the same_page loop below.
+ */
+ if (!pages && !vmas && !pfn_offset &&
+ (vaddr + huge_page_size(h) < vma->vm_end) &&
+ (remainder >= pages_per_huge_page(h))) {
+ vaddr += huge_page_size(h);
+ remainder -= pages_per_huge_page(h);
+ i += pages_per_huge_page(h);
+ spin_unlock(ptl);
+ continue;
+ }
+
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
@@ -4842,7 +4736,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4872,7 +4766,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
return pte;
}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 5b7430bd83a6..e488876b168a 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -67,8 +67,8 @@ static int hwpoison_unpoison(void *data, u64 val)
return unpoison_memory(val);
}
-DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
-DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
+DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
static void pfn_inject_exit(void)
{
diff --git a/mm/internal.h b/mm/internal.h
index 0d5f720c75ab..3cf20ab3ca01 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -165,6 +165,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
+extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_reset(struct zone *zone);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -290,7 +293,8 @@ static inline bool is_data_mapping(vm_flags_t flags)
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent);
+ struct vm_area_struct *prev);
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
extern long populate_vma_page_range(struct vm_area_struct *vma,
@@ -362,6 +366,27 @@ vma_address(struct page *page, struct vm_area_struct *vma)
return max(start, vma->vm_start);
}
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
+{
+ int flags = vmf->flags;
+
+ if (fpin)
+ return fpin;
+
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
+}
+
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6814d6d6a023..df3371d5c572 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -36,6 +36,8 @@
#include <linux/bug.h>
#include <linux/uaccess.h>
+#include <asm/tlbflush.h>
+
#include "kasan.h"
#include "../slab.h"
@@ -590,6 +592,7 @@ void kasan_kfree_large(void *ptr, unsigned long ip)
/* The object will be poisoned by page_alloc. */
}
+#ifndef CONFIG_KASAN_VMALLOC
int kasan_module_alloc(void *addr, size_t size)
{
void *ret;
@@ -625,6 +628,7 @@ void kasan_free_shadow(const struct vm_struct *vm)
if (vm->flags & VM_KASAN)
vfree(kasan_mem_to_shadow(vm->addr));
}
+#endif
extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
@@ -744,3 +748,232 @@ static int __init kasan_memhotplug_init(void)
core_initcall(kasan_memhotplug_init);
#endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+ pte_t pte;
+
+ if (likely(!pte_none(*ptep)))
+ return 0;
+
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pte_none(*ptep))) {
+ set_pte_at(&init_mm, addr, ptep, pte);
+ page = 0;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (page)
+ free_page(page);
+ return 0;
+}
+
+int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area)
+{
+ unsigned long shadow_start, shadow_end;
+ int ret;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(area->addr);
+ shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+ shadow_end = (unsigned long)kasan_mem_to_shadow(area->addr +
+ area->size);
+ shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+
+ ret = apply_to_page_range(&init_mm, shadow_start,
+ shadow_end - shadow_start,
+ kasan_populate_vmalloc_pte, NULL);
+ if (ret)
+ return ret;
+
+ flush_cache_vmap(shadow_start, shadow_end);
+
+ kasan_unpoison_shadow(area->addr, requested_size);
+
+ area->flags |= VM_KASAN;
+
+ /*
+ * We need to be careful about inter-cpu effects here. Consider:
+ *
+ * CPU#0 CPU#1
+ * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
+ * p[99] = 1;
+ *
+ * With compiler instrumentation, that ends up looking like this:
+ *
+ * CPU#0 CPU#1
+ * // vmalloc() allocates memory
+ * // let a = area->addr
+ * // we reach kasan_populate_vmalloc
+ * // and call kasan_unpoison_shadow:
+ * STORE shadow(a), unpoison_val
+ * ...
+ * STORE shadow(a+99), unpoison_val x = LOAD p
+ * // rest of vmalloc process <data dependency>
+ * STORE p, a LOAD shadow(x+99)
+ *
+ * If there is no barrier between the end of unpoisoning the shadow
+ * and the store of the result to p, the stores could be committed
+ * in a different order by CPU#0, and CPU#1 could erroneously observe
+ * poison in the shadow.
+ *
+ * We need some sort of barrier between the stores.
+ *
+ * In the vmalloc() case, this is provided by a smp_wmb() in
+ * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
+ * get_vm_area() and friends, the caller gets shadow allocated but
+ * doesn't have any pages mapped into the virtual address space that
+ * has been reserved. Mapping those pages in will involve taking and
+ * releasing a page-table lock, which will provide the barrier.
+ */
+
+ return 0;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void kasan_poison_vmalloc(void *start, unsigned long size)
+{
+ size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
+}
+
+static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+
+ page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+
+ spin_lock(&init_mm.page_table_lock);
+
+ if (likely(!pte_none(*ptep))) {
+ pte_clear(&init_mm, addr, ptep);
+ free_page(page);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Release the backing for the vmalloc region [start, end), which
+ * lies within the free region [free_region_start, free_region_end).
+ *
+ * This can be run lazily, long after the region was freed. It runs
+ * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
+ * infrastructure.
+ *
+ * How does this work?
+ * -------------------
+ *
+ * We have a region that is page aligned, labelled as A.
+ * That might not map onto the shadow in a way that is page-aligned:
+ *
+ * start end
+ * v v
+ * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |??AAAAAA|AAAAAAAA|AA??????| < shadow
+ * (1) (2) (3)
+ *
+ * First we align the start upwards and the end downwards, so that the
+ * shadow of the region aligns with shadow page boundaries. In the
+ * example, this gives us the shadow page (2). This is the shadow entirely
+ * covered by this allocation.
+ *
+ * Then we have the tricky bits. We want to know if we can free the
+ * partially covered shadow pages - (1) and (3) in the example. For this,
+ * we are given the start and end of the free region that contains this
+ * allocation. Extending our previous example, we could have:
+ *
+ * free_region_start free_region_end
+ * | start end |
+ * v v v v
+ * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
+ * (1) (2) (3)
+ *
+ * Once again, we align the start of the free region up, and the end of
+ * the free region down so that the shadow is page aligned. So we can free
+ * page (1) - we know no allocation currently uses anything in that page,
+ * because all of it is in the vmalloc free region. But we cannot free
+ * page (3), because we can't be sure that the rest of it is unused.
+ *
+ * We only consider pages that contain part of the original region for
+ * freeing: we don't try to free other pages from the free region or we'd
+ * end up trying to free huge chunks of virtual address space.
+ *
+ * Concurrency
+ * -----------
+ *
+ * How do we know that we're not freeing a page that is simultaneously
+ * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
+ *
+ * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
+ * at the same time. While we run under free_vmap_area_lock, the population
+ * code does not.
+ *
+ * free_vmap_area_lock instead operates to ensure that the larger range
+ * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
+ * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
+ * no space identified as free will become used while we are running. This
+ * means that so long as we are careful with alignment and only free shadow
+ * pages entirely covered by the free region, we will not run into any
+ * trouble - any simultaneous allocations will be for disjoint regions.
+ */
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end)
+{
+ void *shadow_start, *shadow_end;
+ unsigned long region_start, region_end;
+
+ region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+ region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+ free_region_start = ALIGN(free_region_start,
+ PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+ if (start != region_start &&
+ free_region_start < region_start)
+ region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
+
+ free_region_end = ALIGN_DOWN(free_region_end,
+ PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+ if (end != region_end &&
+ free_region_end > region_end)
+ region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
+
+ shadow_start = kasan_mem_to_shadow((void *)region_start);
+ shadow_end = kasan_mem_to_shadow((void *)region_end);
+
+ if (shadow_end > shadow_start) {
+ apply_to_page_range(&init_mm, (unsigned long)shadow_start,
+ (unsigned long)(shadow_end - shadow_start),
+ kasan_depopulate_vmalloc_pte, NULL);
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
+ }
+}
+#endif
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index 36c645939bc9..2d97efd4954f 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -86,6 +86,9 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
case KASAN_ALLOCA_RIGHT:
bug_type = "alloca-out-of-bounds";
break;
+ case KASAN_VMALLOC_INVALID:
+ bug_type = "vmalloc-out-of-bounds";
+ break;
}
return bug_type;
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 35cff6bbb716..3a083274628e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -25,6 +25,7 @@
#endif
#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a8a57bebb5fa..b679908743cb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1602,6 +1602,24 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_unlocked;
}
+ } else if (PageDirty(page)) {
+ /*
+ * khugepaged only works on read-only fd,
+ * so this page is dirty because it hasn't
+ * been flushed since first write. There
+ * won't be new dirty pages.
+ *
+ * Trigger async flush here and hope the
+ * writeback is done when khugepaged
+ * revisits this page.
+ *
+ * This is a one-off situation. We are not
+ * forcing writeback in loop.
+ */
+ xas_unlock_irq(&xas);
+ filemap_flush(mapping);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
diff --git a/mm/madvise.c b/mm/madvise.c
index 94c343b4c968..bcdb6a042787 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -864,13 +864,13 @@ static int madvise_inject_error(int behavior,
{
struct page *page;
struct zone *zone;
- unsigned int order;
+ unsigned long size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- for (; start < end; start += PAGE_SIZE << order) {
+ for (; start < end; start += size) {
unsigned long pfn;
int ret;
@@ -882,9 +882,9 @@ static int madvise_inject_error(int behavior,
/*
* When soft offlining hugepages, after migrating the page
* we dissolve it, therefore in the second loop "page" will
- * no longer be a compound page, and order will be 0.
+ * no longer be a compound page.
*/
- order = compound_order(compound_head(page));
+ size = page_size(compound_head(page));
if (PageHWPoison(page)) {
put_page(page);
@@ -895,7 +895,7 @@ static int madvise_inject_error(int behavior,
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
pfn, start);
- ret = soft_offline_page(page, MF_COUNT_INCREASED);
+ ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
if (ret)
return ret;
continue;
@@ -1059,9 +1059,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (!madvise_behavior_valid(behavior))
return error;
- if (start & ~PAGE_MASK)
+ if (!PAGE_ALIGNED(start))
return error;
- len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+ len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
diff --git a/mm/memblock.c b/mm/memblock.c
index c4b16cae2bc9..4bc2c7d8bf42 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -57,42 +57,38 @@
* at build time. The region arrays for the "memory" and "reserved"
* types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
* "physmap" type to %INIT_PHYSMEM_REGIONS.
- * The :c:func:`memblock_allow_resize` enables automatic resizing of
- * the region arrays during addition of new regions. This feature
- * should be used with care so that memory allocated for the region
- * array will not overlap with areas that should be reserved, for
- * example initrd.
+ * The memblock_allow_resize() enables automatic resizing of the region
+ * arrays during addition of new regions. This feature should be used
+ * with care so that memory allocated for the region array will not
+ * overlap with areas that should be reserved, for example initrd.
*
* The early architecture setup should tell memblock what the physical
- * memory layout is by using :c:func:`memblock_add` or
- * :c:func:`memblock_add_node` functions. The first function does not
- * assign the region to a NUMA node and it is appropriate for UMA
- * systems. Yet, it is possible to use it on NUMA systems as well and
- * assign the region to a NUMA node later in the setup process using
- * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node`
- * performs such an assignment directly.
+ * memory layout is by using memblock_add() or memblock_add_node()
+ * functions. The first function does not assign the region to a NUMA
+ * node and it is appropriate for UMA systems. Yet, it is possible to
+ * use it on NUMA systems as well and assign the region to a NUMA node
+ * later in the setup process using memblock_set_node(). The
+ * memblock_add_node() performs such an assignment directly.
*
* Once memblock is setup the memory can be allocated using one of the
* API variants:
*
- * * :c:func:`memblock_phys_alloc*` - these functions return the
- * **physical** address of the allocated memory
- * * :c:func:`memblock_alloc*` - these functions return the **virtual**
- * address of the allocated memory.
+ * * memblock_phys_alloc*() - these functions return the **physical**
+ * address of the allocated memory
+ * * memblock_alloc*() - these functions return the **virtual** address
+ * of the allocated memory.
*
* Note, that both API variants use implict assumptions about allowed
* memory ranges and the fallback methods. Consult the documentation
- * of :c:func:`memblock_alloc_internal` and
- * :c:func:`memblock_alloc_range_nid` functions for more elaboarte
- * description.
+ * of memblock_alloc_internal() and memblock_alloc_range_nid()
+ * functions for more elaborate description.
*
- * As the system boot progresses, the architecture specific
- * :c:func:`mem_init` function frees all the memory to the buddy page
- * allocator.
+ * As the system boot progresses, the architecture specific mem_init()
+ * function frees all the memory to the buddy page allocator.
*
- * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
+ * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
* memblock data structures will be discarded after the system
- * initialization compltes.
+ * initialization completes.
*/
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -1323,12 +1319,13 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
* @start: the lower bound of the memory region to allocate (phys address)
* @end: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: if true, do not fall back to nodes other than @nid
*
* The allocation is performed from memory region limited by
- * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
+ * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
*
- * If the specified node can not hold the requested memory the
- * allocation falls back to any node in the system
+ * If the specified node can not hold the requested memory and @exact_nid
+ * is false, the allocation falls back to any node in the system.
*
* For systems with memory mirroring, the allocation is attempted first
* from the regions with mirroring enabled and then retried from any
@@ -1342,7 +1339,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
*/
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid,
+ bool exact_nid)
{
enum memblock_flags flags = choose_memblock_flags();
phys_addr_t found;
@@ -1362,7 +1360,7 @@ again:
if (found && !memblock_reserve(found, size))
goto done;
- if (nid != NUMA_NO_NODE) {
+ if (nid != NUMA_NO_NODE && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
@@ -1410,7 +1408,8 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t start,
phys_addr_t end)
{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+ false);
}
/**
@@ -1429,7 +1428,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
return memblock_alloc_range_nid(size, align, 0,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid, false);
}
/**
@@ -1439,6 +1438,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
* @min_addr: the lower bound of the memory region to allocate (phys address)
* @max_addr: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: if true, do not fall back to nodes other than @nid
*
* Allocates memory block using memblock_alloc_range_nid() and
* converts the returned physical address to virtual.
@@ -1454,7 +1454,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
static void * __init memblock_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
+ int nid, bool exact_nid)
{
phys_addr_t alloc;
@@ -1469,11 +1469,13 @@ static void * __init memblock_alloc_internal(
if (max_addr > memblock.current_limit)
max_addr = memblock.current_limit;
- alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
+ alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
+ exact_nid);
/* retry allocation without lower limit */
if (!alloc && min_addr)
- alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
+ exact_nid);
if (!alloc)
return NULL;
@@ -1482,6 +1484,43 @@ static void * __init memblock_alloc_internal(
}
/**
+ * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node
+ * without zeroing memory
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory.
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_alloc_exact_nid_raw(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
+
+ ptr = memblock_alloc_internal(size, align,
+ min_addr, max_addr, nid, true);
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
+/**
* memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
* memory and without panicking
* @size: size of memory block to be allocated in bytes
@@ -1512,7 +1551,7 @@ void * __init memblock_alloc_try_nid_raw(
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid);
+ min_addr, max_addr, nid, false);
if (ptr && size > 0)
page_init_poison(ptr, size);
@@ -1547,7 +1586,7 @@ void * __init memblock_alloc_try_nid(
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid);
+ min_addr, max_addr, nid, false);
if (ptr)
memset(ptr, 0, size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 01f3f8b665e9..bc01423277c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -108,7 +108,6 @@ static const char *const mem_cgroup_lru_names[] = {
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET 1024
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -778,7 +777,7 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
if (!memcg || memcg == root_mem_cgroup) {
__mod_node_page_state(pgdat, idx, val);
} else {
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
rcu_read_unlock();
@@ -877,9 +876,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
- case MEM_CGROUP_TARGET_NUMAINFO:
- next = val + NUMAINFO_EVENTS_TARGET;
- break;
default:
break;
}
@@ -899,21 +895,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
- bool do_numainfo __maybe_unused;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_NUMAINFO);
-#endif
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
- if (unlikely(do_numainfo))
- atomic_inc(&memcg->numainfo_events);
-#endif
}
}
@@ -1052,7 +1039,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_per_node *mz;
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
- iter = &mz->iter[reclaim->priority];
+ iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
@@ -1152,15 +1139,11 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
- int i;
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(from, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ iter = &mz->iter;
+ cmpxchg(&iter->position, dead_memcg, NULL);
}
}
@@ -1238,7 +1221,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
- lruvec = &pgdat->lruvec;
+ lruvec = &pgdat->__lruvec;
goto out;
}
@@ -1595,104 +1578,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
return ret;
}
-#if MAX_NUMNODES > 1
-
-/**
- * test_mem_cgroup_node_reclaimable
- * @memcg: the target memcg
- * @nid: the node ID to be checked.
- * @noswap : specify true here if the user wants flle only information.
- *
- * This function returns whether the specified memcg contains any
- * reclaimable pages on a node. Returns true if there are any reclaimable
- * pages in the node.
- */
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
- int nid, bool noswap)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-
- if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
- lruvec_page_state(lruvec, NR_ACTIVE_FILE))
- return true;
- if (noswap || !total_swap_pages)
- return false;
- if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
- lruvec_page_state(lruvec, NR_ACTIVE_ANON))
- return true;
- return false;
-
-}
-
-/*
- * Always updating the nodemask is not very good - even if we have an empty
- * list or the wrong list here, we can start from some node and traverse all
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
- *
- */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
-{
- int nid;
- /*
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
- * pagein/pageout changes since the last update.
- */
- if (!atomic_read(&memcg->numainfo_events))
- return;
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
- return;
-
- /* make a nodemask where this memcg uses memory from */
- memcg->scan_nodes = node_states[N_MEMORY];
-
- for_each_node_mask(nid, node_states[N_MEMORY]) {
-
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
- node_clear(nid, memcg->scan_nodes);
- }
-
- atomic_set(&memcg->numainfo_events, 0);
- atomic_set(&memcg->numainfo_updating, 0);
-}
-
-/*
- * Selecting a node where we start reclaim from. Because what we need is just
- * reducing usage counter, start from anywhere is O,K. Considering
- * memory reclaim from current node, there are pros. and cons.
- *
- * Freeing memory from current node means freeing memory from a node which
- * we'll use or we've used. So, it may make LRU bad. And if several threads
- * hit limits, it will see a contention on a node. But freeing from remote
- * node means more costs for memory reclaim because of memory latency.
- *
- * Now, we use round-robin. Better algorithm is welcomed.
- */
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
- int node;
-
- mem_cgroup_may_update_nodemask(memcg);
- node = memcg->last_scanned_node;
-
- node = next_node_in(node, memcg->scan_nodes);
- /*
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
- * last time it really checked all the LRUs due to rate limiting.
- * Fallback to the current node in that case for simplicity.
- */
- if (unlikely(node == MAX_NUMNODES))
- node = numa_node_id();
-
- memcg->last_scanned_node = node;
- return node;
-}
-#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
- return 0;
-}
-#endif
-
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
pg_data_t *pgdat,
gfp_t gfp_mask,
@@ -1705,7 +1590,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
- .priority = 0,
};
excess = soft_limit_excess(root_memcg);
@@ -3750,7 +3634,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
enum lru_list lru;
@@ -5078,7 +4962,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
- memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
@@ -5455,8 +5338,8 @@ static int mem_cgroup_move_account(struct page *page,
anon = PageAnon(page);
pgdat = page_pgdat(page);
- from_vec = mem_cgroup_lruvec(pgdat, from);
- to_vec = mem_cgroup_lruvec(pgdat, to);
+ from_vec = mem_cgroup_lruvec(from, pgdat);
+ to_vec = mem_cgroup_lruvec(to, pgdat);
spin_lock_irqsave(&from->move_lock, flags);
@@ -6096,7 +5979,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
+ unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long high;
int err;
@@ -6107,12 +5991,29 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
- nr_pages = page_counter_read(&memcg->memory);
- if (nr_pages > high)
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, true);
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long reclaimed;
+
+ if (nr_pages <= high)
+ break;
+
+ if (signal_pending(current))
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ break;
+ }
- memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -6144,10 +6045,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_pages <= max)
break;
- if (signal_pending(current)) {
- err = -EINTR;
+ if (signal_pending(current))
break;
- }
if (!drained) {
drain_all_stock(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3151c87dff73..41c634f45d45 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -303,30 +303,24 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- * TBD would GFP_NOIO be enough?
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma,
- struct list_head *to_kill,
- struct to_kill **tkc)
+ struct list_head *to_kill)
{
struct to_kill *tk;
- if (*tkc) {
- tk = *tkc;
- *tkc = NULL;
- } else {
- tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
- if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
- return;
- }
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+ if (!tk) {
+ pr_err("Memory failure: Out of memory while machine check handling\n");
+ return;
}
+
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
else
- tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
+ tk->size_shift = page_shift(compound_head(p));
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -345,6 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
kfree(tk);
return;
}
+
get_task_struct(tsk);
tk->tsk = tsk;
list_add_tail(&tk->nd, to_kill);
@@ -436,7 +431,7 @@ static struct task_struct *task_early_kill(struct task_struct *tsk,
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -461,7 +456,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ add_to_kill(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -472,7 +467,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -496,7 +491,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ add_to_kill(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -505,26 +500,17 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
/*
* Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
*/
static void collect_procs(struct page *page, struct list_head *tokill,
int force_early)
{
- struct to_kill *tk;
-
if (!page->mapping)
return;
- tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
- if (!tk)
- return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk, force_early);
+ collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, &tk, force_early);
- kfree(tk);
+ collect_procs_file(page, tokill, force_early);
}
static const char *action_name[] = {
@@ -1490,7 +1476,7 @@ static void memory_failure_work_func(struct work_struct *work)
if (!gotten)
break;
if (entry.flags & MF_SOFT_OFFLINE)
- soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+ soft_offline_page(entry.pfn, entry.flags);
else
memory_failure(entry.pfn, entry.flags);
}
@@ -1871,7 +1857,7 @@ static int soft_offline_free_page(struct page *page)
/**
* soft_offline_page - Soft offline a page.
- * @page: page to offline
+ * @pfn: pfn to soft-offline
* @flags: flags. Same as memory_failure().
*
* Returns 0 on success, otherwise negated errno.
@@ -1891,18 +1877,17 @@ static int soft_offline_free_page(struct page *page)
* This is not a 100% solution for all memory, but tries to be
* ``good enough'' for the majority of memory.
*/
-int soft_offline_page(struct page *page, int flags)
+int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
- unsigned long pfn = page_to_pfn(page);
+ struct page *page;
- if (is_zone_device_page(page)) {
- pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_page(page);
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+ /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+ page = pfn_to_online_page(pfn);
+ if (!page)
return -EIO;
- }
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/memory.c b/mm/memory.c
index b6a5d6a08438..513c3ecc76ee 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -72,6 +72,8 @@
#include <linux/oom.h>
#include <linux/numa.h>
+#include <trace/events/kmem.h>
+
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
@@ -152,6 +154,10 @@ static int __init init_zero_pfn(void)
}
core_initcall(init_zero_pfn);
+void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+{
+ trace_rss_stat(mm, member, count);
+}
#if defined(SPLIT_RSS_COUNTING)
@@ -2289,10 +2295,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
*
* The function expects the page to be locked and unlocks it.
*/
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
- struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
+ struct page *page = vmf->page;
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
@@ -2307,16 +2314,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
mapping = page_rmapping(page);
unlock_page(page);
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+
+ /*
+ * Throttle page dirtying rate down to writeback speed.
+ *
+ * mapping may be NULL here because some device drivers do not
+ * set page.mapping but still dirty their pages.
+ *
+ * Drop the mmap_sem before waiting on IO, if we can. The file
+ * is pinning the mapping, as per above.
+ */
if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
+ struct file *fpin;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
balance_dirty_pages_ratelimited(mapping);
+ if (fpin) {
+ fput(fpin);
+ return VM_FAULT_RETRY;
+ }
}
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ return 0;
}
/*
@@ -2571,6 +2592,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = VM_FAULT_WRITE;
get_page(vmf->page);
@@ -2594,10 +2616,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
wp_page_reuse(vmf);
lock_page(vmf->page);
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
put_page(vmf->page);
- return VM_FAULT_WRITE;
+ return ret;
}
/*
@@ -3083,7 +3105,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/*
* The memory barrier inside __SetPageUptodate makes sure that
- * preceeding stores to the page contents become visible before
+ * preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
@@ -3641,7 +3663,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
return ret;
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
return ret;
}
@@ -3988,6 +4010,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
return VM_FAULT_OOM;
+retry_pud:
if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
@@ -4014,6 +4037,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
+
+ /* Huge pud page fault raced with pmd_alloc? */
+ if (pud_trans_unstable(vmf.pud))
+ goto retry_pud;
+
if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f307bd82d750..55ac23ef11c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -49,8 +49,6 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page, unsigned int order);
-
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -278,6 +276,22 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
return 0;
}
+static int check_hotplug_memory_addressable(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+ if (max_addr >> MAX_PHYSMEM_BITS) {
+ const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+ WARN(1,
+ "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+ (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -291,6 +305,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
unsigned long nr, start_sec, end_sec;
struct vmem_altmap *altmap = restrictions->altmap;
+ err = check_hotplug_memory_addressable(pfn, nr_pages);
+ if (err)
+ return err;
+
if (altmap) {
/*
* Validate altmap is within bounds of the total request
@@ -580,24 +598,7 @@ int restore_online_page_callback(online_page_callback_t callback)
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
-void __online_page_set_limits(struct page *page)
-{
-}
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
-
-void __online_page_increment_counters(struct page *page)
-{
- adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
-
-void __online_page_free(struct page *page)
-{
- __free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
-
-static void generic_online_page(struct page *page, unsigned int order)
+void generic_online_page(struct page *page, unsigned int order)
{
kernel_map_pages(page, 1 << order, 1);
__free_pages_core(page, order);
@@ -607,6 +608,7 @@ static void generic_online_page(struct page *page, unsigned int order)
totalhigh_pages_add(1UL << order);
#endif
}
+EXPORT_SYMBOL_GPL(generic_online_page);
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
@@ -1180,7 +1182,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+ MEMORY_OFFLINE);
}
/* Checks if this range of memory is likely to be hot-removable. */
@@ -1377,9 +1380,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/*
- * remove from free_area[] and mark all as Reserved.
- */
+/* Mark all sections offline and remove all free pages from the buddy. */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
@@ -1397,7 +1398,8 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages,
+ MEMORY_OFFLINE);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1478,10 +1480,19 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
node_clear_state(node, N_MEMORY);
}
+static int count_system_ram_pages_cb(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long *nr_system_ram_pages = data;
+
+ *nr_system_ram_pages += nr_pages;
+ return 0;
+}
+
static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, nr_pages;
+ unsigned long pfn, nr_pages = 0;
unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
@@ -1492,6 +1503,22 @@ static int __ref __offline_pages(unsigned long start_pfn,
mem_hotplug_begin();
+ /*
+ * Don't allow to offline memory blocks that contain holes.
+ * Consequently, memory blocks with holes can never get onlined
+ * via the hotplug path - online_pages() - as hotplugged memory has
+ * no holes. This way, we e.g., don't have to worry about marking
+ * memory holes PG_reserved, don't need pfn_valid() checks, and can
+ * avoid using walk_system_ram_range() later.
+ */
+ walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ count_system_ram_pages_cb);
+ if (nr_pages != end_pfn - start_pfn) {
+ ret = -EINVAL;
+ reason = "memory holes";
+ goto failed_removal;
+ }
+
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
@@ -1503,12 +1530,11 @@ static int __ref __offline_pages(unsigned long start_pfn,
zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
- nr_pages = end_pfn - start_pfn;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- SKIP_HWPOISON | REPORT_FAILURE);
+ MEMORY_OFFLINE | REPORT_FAILURE);
if (ret < 0) {
reason = "failure to isolate range";
goto failed_removal;
@@ -1750,13 +1776,13 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- memblock_free(start, size);
- memblock_remove(start, size);
/* remove memory block devices before removing memory */
remove_memory_block_devices(start, size);
arch_remove_memory(nid, start, size, NULL);
+ memblock_free(start, size);
+ memblock_remove(start, size);
__release_memory_resource(start, size);
try_offline_node(nid);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08c94170ae4..067cf7d3daf5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -410,7 +410,9 @@ struct queue_pages {
struct list_head *pagelist;
unsigned long flags;
nodemask_t *nmask;
- struct vm_area_struct *prev;
+ unsigned long start;
+ unsigned long end;
+ struct vm_area_struct *first;
};
/*
@@ -618,6 +620,22 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
+ /* range check first */
+ VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
+
+ if (!qp->first) {
+ qp->first = vma;
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ (qp->start < vma->vm_start))
+ /* hole at head side of range */
+ return -EFAULT;
+ }
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ ((vma->vm_end < qp->end) &&
+ (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ /* hole at middle or tail of range */
+ return -EFAULT;
+
/*
* Need check MPOL_MF_STRICT to return -EIO if possible
* regardless of vma_migratable
@@ -628,17 +646,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (endvma > end)
endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return -EFAULT;
- if (qp->prev && qp->prev->vm_end < vma->vm_start)
- return -EFAULT;
- }
-
- qp->prev = vma;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
@@ -681,14 +688,23 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{
+ int err;
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
- .prev = NULL,
+ .start = start,
+ .end = end,
+ .first = NULL,
};
- return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+ err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+ if (!qp.first)
+ /* whole range in hole */
+ err = -EFAULT;
+
+ return err;
}
/*
@@ -740,8 +756,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long vmend;
vma = find_vma(mm, start);
- if (!vma || vma->vm_start > start)
- return -EFAULT;
+ VM_BUG_ON(!vma);
prev = vma->vm_prev;
if (start > vma->vm_start)
diff --git a/mm/migrate.c b/mm/migrate.c
index 4fe45d1428c8..eae1565285e3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1168,15 +1168,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
enum migrate_reason reason)
{
int rc = MIGRATEPAGE_SUCCESS;
- struct page *newpage;
+ struct page *newpage = NULL;
if (!thp_migration_supported() && PageTransHuge(page))
return -ENOMEM;
- newpage = get_new_page(page, private);
- if (!newpage)
- return -ENOMEM;
-
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
ClearPageActive(page);
@@ -1187,13 +1183,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
__ClearPageIsolated(page);
unlock_page(page);
}
- if (put_new_page)
- put_new_page(newpage, private);
- else
- put_page(newpage);
goto out;
}
+ newpage = get_new_page(page, private);
+ if (!newpage)
+ return -ENOMEM;
+
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1863,7 +1859,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) +
nr_migrate_pages,
- 0, 0))
+ ZONE_MOVABLE, 0))
continue;
return true;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index a7d8c84d19b7..9c648524e4dc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -641,7 +641,7 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
- __vma_link_list(mm, vma, prev, rb_parent);
+ __vma_link_list(mm, vma, prev);
__vma_link_rb(mm, vma, rb_link, rb_parent);
}
@@ -684,37 +684,14 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
static __always_inline void __vma_unlink_common(struct mm_struct *mm,
struct vm_area_struct *vma,
- struct vm_area_struct *prev,
- bool has_prev,
struct vm_area_struct *ignore)
{
- struct vm_area_struct *next;
-
vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- next = vma->vm_next;
- if (has_prev)
- prev->vm_next = next;
- else {
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- }
- if (next)
- next->vm_prev = prev;
-
+ __vma_unlink_list(mm, vma);
/* Kill the cache */
vmacache_invalidate(mm);
}
-static inline void __vma_unlink_prev(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- __vma_unlink_common(mm, vma, prev, true, vma);
-}
-
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
@@ -769,8 +746,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
remove_next = 1 + (end > next->vm_end);
VM_WARN_ON(remove_next == 2 &&
end != next->vm_next->vm_end);
- VM_WARN_ON(remove_next == 1 &&
- end != next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
@@ -889,7 +864,7 @@ again:
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
- __vma_unlink_prev(mm, next, vma);
+ __vma_unlink_common(mm, next, next);
else
/*
* vma is not before next if they've been
@@ -900,7 +875,7 @@ again:
* "next" (which is stored in post-swap()
* "vma").
*/
- __vma_unlink_common(mm, next, NULL, false, vma);
+ __vma_unlink_common(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -1116,15 +1091,18 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* the area passed down from mprotect_fixup, never extending beyond one
* vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
*
- * AAAA AAAA AAAA AAAA
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
- * cannot merge might become might become might become
- * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
- * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
- * mremap move: PPPPXXXXXXXX 8
- * AAAA
- * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
- * might become case 1 below case 2 below case 3 below
+ * AAAA AAAA AAAA
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
+ * cannot merge might become might become
+ * PPNNNNNNNNNN PPPPPPPPPPNN
+ * mmap, brk or case 4 below case 5 below
+ * mremap move:
+ * AAAA AAAA
+ * PPPP NNNN PPPPNNNNXXXX
+ * might become might become
+ * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
+ * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
+ * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
*
* It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
@@ -1442,7 +1420,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (offset_in_page(addr))
+ if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
@@ -3006,15 +2984,16 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
+ unsigned long mapped_addr;
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (offset_in_page(error))
- return error;
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
error = mlock_future_check(mm, mm->def_flags, len);
if (error)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7967825f6d33..7a8e84f86831 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -80,6 +80,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (prot_numa) {
struct page *page;
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ continue;
+
page = vm_normal_page(vma, addr, oldpte);
if (!page || PageKsm(page))
continue;
@@ -97,10 +101,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (page_is_file_cache(page) && PageDirty(page))
continue;
- /* Avoid TLB flush if possible */
- if (pte_protnone(oldpte))
- continue;
-
/*
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
diff --git a/mm/mremap.c b/mm/mremap.c
index 1fc8a29fbe3f..122938dcec15 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -558,7 +558,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (offset_in_page(ret))
+ if (IS_ERR_VALUE(ret))
goto out1;
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
@@ -706,7 +706,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (offset_in_page(new_addr)) {
+ if (IS_ERR_VALUE(new_addr)) {
ret = new_addr;
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 7de592058ab4..bd2b4e5ef144 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -648,7 +648,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
if (rb_prev)
prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- __vma_link_list(mm, vma, prev, parent);
+ __vma_link_list(mm, vma, prev);
}
/*
@@ -684,13 +684,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
/* remove from the MM's tree and list */
rb_erase(&vma->vm_rb, &mm->mm_rb);
- if (vma->vm_prev)
- vma->vm_prev->vm_next = vma->vm_next;
- else
- mm->mmap = vma->vm_next;
-
- if (vma->vm_next)
- vma->vm_next->vm_prev = vma->vm_prev;
+ __vma_unlink_list(mm, vma);
}
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f391c0c4ed1d..4785a8a2040e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5354,6 +5354,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" min:%lukB"
" low:%lukB"
" high:%lukB"
+ " reserved_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
@@ -5375,6 +5376,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
+ K(zone->nr_reserved_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
@@ -6711,7 +6713,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
- lruvec_init(node_lruvec(pgdat));
+ lruvec_init(&pgdat->__lruvec);
}
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
@@ -7988,6 +7990,15 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+static void __zone_pcp_update(struct zone *zone)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
+}
+
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu
@@ -8019,13 +8030,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
goto out;
- for_each_populated_zone(zone) {
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
- }
+ for_each_populated_zone(zone)
+ __zone_pcp_update(zone);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
@@ -8261,7 +8267,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
if (__PageMovable(page))
@@ -8477,7 +8483,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, false)) {
+ if (test_pages_isolated(outer_start, end, 0)) {
pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
@@ -8502,6 +8508,107 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+
+static int __alloc_contig_pages(unsigned long start_pfn,
+ unsigned long nr_pages, gfp_t gfp_mask)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+ gfp_mask);
+}
+
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ page = pfn_to_online_page(i);
+ if (!page)
+ return false;
+
+ if (page_zone(page) != z)
+ return false;
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages: Number of contiguous pages to allocate
+ * @gfp_mask: GFP mask to limit search and used during compaction
+ * @nid: Target node
+ * @nodemask: Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which can not be fulfilled with the buddy allocator.
+ *
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
+ * power of two then the alignment is guaranteed to be to the given nr_pages
+ * (e.g. 1GB request would be aligned to 1GB).
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ unsigned long ret, pfn, flags;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+
+ zonelist = node_zonelist(nid, gfp_mask);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(gfp_mask), nodemask) {
+ spin_lock_irqsave(&zone->lock, flags);
+
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&zone->lock, flags);
+ ret = __alloc_contig_pages(pfn, nr_pages,
+ gfp_mask);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+ return NULL;
+}
#endif /* CONFIG_CONTIG_ALLOC */
void free_contig_range(unsigned long pfn, unsigned int nr_pages)
@@ -8523,11 +8630,8 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
*/
void __meminit zone_pcp_update(struct zone *zone)
{
- unsigned cpu;
mutex_lock(&pcp_batch_high_lock);
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
+ __zone_pcp_update(zone);
mutex_unlock(&pcp_batch_high_lock);
}
@@ -8560,7 +8664,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
struct zone *zone;
- unsigned int order, i;
+ unsigned int order;
unsigned long pfn;
unsigned long flags;
unsigned long offlined_pages = 0;
@@ -8588,7 +8692,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- SetPageReserved(page);
offlined_pages++;
continue;
}
@@ -8602,8 +8705,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
pfn, 1 << order, end_pfn);
#endif
del_page_from_free_area(page, &zone->free_area[order]);
- for (i = 0; i < (1 << order); i++)
- SetPageReserved((page+i));
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/page_io.c b/mm/page_io.c
index 60a66a58b9bf..3a198deb8bb1 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -22,6 +22,7 @@
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
+#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <asm/pgtable.h>
@@ -354,10 +355,19 @@ int swap_readpage(struct page *page, bool synchronous)
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
struct gendisk *disk;
+ unsigned long pflags;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
+
+ /*
+ * Count submission time as memory stall. When the device is congested,
+ * or the submitting cgroup IO-throttled, submission can be a
+ * significant part of overall IO time.
+ */
+ psi_memstall_enter(&pflags);
+
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -371,7 +381,7 @@ int swap_readpage(struct page *page, bool synchronous)
ret = mapping->a_ops->readpage(swap_file, page);
if (!ret)
count_vm_event(PSWPIN);
- return ret;
+ goto out;
}
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
@@ -382,7 +392,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
count_vm_event(PSWPIN);
- return 0;
+ goto out;
}
ret = 0;
@@ -418,6 +428,7 @@ int swap_readpage(struct page *page, bool synchronous)
bio_put(bio);
out:
+ psi_memstall_leave(&pflags);
return ret;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 89c19c0feadb..04ee1663cdbe 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -168,7 +168,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @migratetype: Migrate type to set in error recovery.
* @flags: The following flags are allowed (they can be combined in
* a bit mask)
- * SKIP_HWPOISON - ignore hwpoison pages
+ * MEMORY_OFFLINE - isolate to offline (!allocate) memory
+ * e.g., skip over PageHWPoison() pages
* REPORT_FAILURE - report details about the failure to
* isolate the range
*
@@ -257,7 +258,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
- bool skip_hwpoisoned_pages)
+ int flags)
{
struct page *page;
@@ -274,7 +275,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* simple way to verify that as VM_BUG_ON(), though.
*/
pfn += 1 << page_order(page);
- else if (skip_hwpoisoned_pages && PageHWPoison(page))
+ else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
else
@@ -286,7 +287,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
/* Caller should ensure that requested range is in a single zone */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- bool skip_hwpoisoned_pages)
+ int isol_flags)
{
unsigned long pfn, flags;
struct page *page;
@@ -308,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
- skip_hwpoisoned_pages);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
spin_unlock_irqrestore(&zone->lock, flags);
trace_test_pages_isolated(start_pfn, end_pfn, pfn);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 532c29276fce..3d7c01e76efc 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -24,18 +24,27 @@ void pgd_clear_bad(pgd_t *pgd)
pgd_clear(pgd);
}
+#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
p4d_ERROR(*p4d);
p4d_clear(p4d);
}
+#endif
+#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
pud_ERROR(*pud);
pud_clear(pud);
}
+#endif
+/*
+ * Note that the pmd variant below can't be stubbed out the way the p4d/pud
+ * variants above are: pmd folding is special, and pmd_* macros typically
+ * refer to the upper level even when folded.
+ */
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0c7b2a9400d4..b3e381919835 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -251,18 +251,37 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
+ struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
+
+ /*
+ * If parent shares anon_vma with its vm_prev, keep this sharing in the
+ * child.
+ *
+ * 1. Parent has vm_prev, which implies we have vm_prev.
+ * 2. Parent and its vm_prev have the same anon_vma.
+ */
+ if (!dst->anon_vma && src->anon_vma &&
+ pprev && pprev->anon_vma == src->anon_vma)
+ dst->anon_vma = prev->anon_vma;
+
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
@@ -287,8 +306,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
- anon_vma->degree < 2)
+ if (!dst->anon_vma && src->anon_vma &&
+ anon_vma != src->anon_vma && anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
@@ -458,9 +477,10 @@ void __init anon_vma_init(void)
* chain and verify that the page in question is indeed mapped in it
* [ something equivalent to page_mapped_in_vma() ].
*
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
- * that the anon_vma pointer from page->mapping is valid if there is a
- * mapcount, we can dereference the anon_vma after observing those.
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
*/
struct anon_vma *page_get_anon_vma(struct page *page)
{
@@ -1055,7 +1075,6 @@ static void __page_set_anon_rmap(struct page *page,
static void __page_check_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
-#ifdef CONFIG_DEBUG_VM
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
@@ -1068,9 +1087,9 @@ static void __page_check_anon_rmap(struct page *page,
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
- BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
-#endif
+ VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ page);
}
/**
@@ -1273,12 +1292,20 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (TestClearPageDoubleMap(page)) {
/*
* Subpages can be mapped with PTEs too. Check how many of
- * themi are still mapped.
+ * them are still mapped.
*/
for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
+
+ /*
+ * Queue the page for deferred split if at least one small
+ * page of the compound page is unmapped, but at least one
+ * small page is still mapped.
+ */
+ if (nr && nr < HPAGE_PMD_NR)
+ deferred_split_huge_page(page);
} else {
nr = HPAGE_PMD_NR;
}
@@ -1286,10 +1313,8 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
- if (nr) {
+ if (nr)
__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
- deferred_split_huge_page(page);
- }
}
/**
diff --git a/mm/shmem.c b/mm/shmem.c
index 220be9fa2c41..165fa6332993 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1369,7 +1369,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ if (add_to_swap_cache(page, swap,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
@@ -2022,16 +2023,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
shmem_falloc->waitq &&
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
+ struct file *fpin;
wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
ret = VM_FAULT_NOPAGE;
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* It's polite to up mmap_sem if we can */
- up_read(&vma->vm_mm->mmap_sem);
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (fpin)
ret = VM_FAULT_RETRY;
- }
shmem_falloc_waitq = shmem_falloc->waitq;
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2048,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
spin_unlock(&inode->i_lock);
+
+ if (fpin)
+ fput(fpin);
return ret;
}
spin_unlock(&inode->i_lock);
@@ -2213,11 +2215,14 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return -EPERM;
/*
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
- * read-only mapping, take care to not allow mprotect to revert
- * protections.
+ * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+ * MAP_SHARED and read-only, take care to not allow mprotect to
+ * revert protections on such mappings. Do this only for shared
+ * mappings. For private mappings, don't need to mask
+ * VM_MAYWRITE as we still want them to be COW-writable.
*/
- vma->vm_flags &= ~(VM_MAYWRITE);
+ if (vma->vm_flags & VM_SHARED)
+ vma->vm_flags &= ~(VM_MAYWRITE);
}
file_accessed(file);
@@ -2742,7 +2747,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
}
shmem_falloc.waitq = &shmem_falloc_waitq;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
spin_lock(&inode->i_lock);
inode->i_private = &shmem_falloc;
@@ -3928,7 +3933,7 @@ out2:
static ssize_t shmem_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int values[] = {
+ static const int values[] = {
SHMEM_HUGE_ALWAYS,
SHMEM_HUGE_WITHIN_SIZE,
SHMEM_HUGE_ADVISE,
diff --git a/mm/slab.c b/mm/slab.c
index 66e5d8032bae..f1e1840af533 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1247,9 +1247,10 @@ void __init kmem_cache_init(void)
* structures first. Without this, further allocations will bug.
*/
kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
- kmalloc_info[INDEX_NODE].name,
- kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
- 0, kmalloc_size(INDEX_NODE));
+ kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
+ kmalloc_info[INDEX_NODE].size,
+ ARCH_KMALLOC_FLAGS, 0,
+ kmalloc_info[INDEX_NODE].size);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
diff --git a/mm/slab.h b/mm/slab.h
index b2b01694dc43..7e94700aa78c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -139,7 +139,7 @@ extern struct kmem_cache *kmem_cache;
/* A table of kmalloc cache names and sizes */
extern const struct kmalloc_info_struct {
- const char *name;
+ const char *name[NR_KMALLOC_TYPES];
unsigned int size;
} kmalloc_info[];
@@ -369,7 +369,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
if (ret)
goto out;
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
/* transer try_charge() page references to kmem_cache */
@@ -393,7 +393,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
rcu_read_lock();
memcg = READ_ONCE(s->memcg_params.memcg);
if (likely(!mem_cgroup_is_root(memcg))) {
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
memcg_kmem_uncharge_memcg(page, order, memcg);
} else {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f9fb27b4c843..8afa188f6e20 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1139,26 +1139,56 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
return kmalloc_caches[kmalloc_type(flags)][index];
}
+#ifdef CONFIG_ZONE_DMA
+#define INIT_KMALLOC_INFO(__size, __short_size) \
+{ \
+ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
+ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
+ .size = __size, \
+}
+#else
+#define INIT_KMALLOC_INFO(__size, __short_size) \
+{ \
+ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
+ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ .size = __size, \
+}
+#endif
+
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
* kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
* kmalloc-67108864.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
- {NULL, 0}, {"kmalloc-96", 96},
- {"kmalloc-192", 192}, {"kmalloc-8", 8},
- {"kmalloc-16", 16}, {"kmalloc-32", 32},
- {"kmalloc-64", 64}, {"kmalloc-128", 128},
- {"kmalloc-256", 256}, {"kmalloc-512", 512},
- {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
- {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
- {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
- {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
- {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
- {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
- {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
- {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
- {"kmalloc-64M", 67108864}
+ INIT_KMALLOC_INFO(0, 0),
+ INIT_KMALLOC_INFO(96, 96),
+ INIT_KMALLOC_INFO(192, 192),
+ INIT_KMALLOC_INFO(8, 8),
+ INIT_KMALLOC_INFO(16, 16),
+ INIT_KMALLOC_INFO(32, 32),
+ INIT_KMALLOC_INFO(64, 64),
+ INIT_KMALLOC_INFO(128, 128),
+ INIT_KMALLOC_INFO(256, 256),
+ INIT_KMALLOC_INFO(512, 512),
+ INIT_KMALLOC_INFO(1024, 1k),
+ INIT_KMALLOC_INFO(2048, 2k),
+ INIT_KMALLOC_INFO(4096, 4k),
+ INIT_KMALLOC_INFO(8192, 8k),
+ INIT_KMALLOC_INFO(16384, 16k),
+ INIT_KMALLOC_INFO(32768, 32k),
+ INIT_KMALLOC_INFO(65536, 64k),
+ INIT_KMALLOC_INFO(131072, 128k),
+ INIT_KMALLOC_INFO(262144, 256k),
+ INIT_KMALLOC_INFO(524288, 512k),
+ INIT_KMALLOC_INFO(1048576, 1M),
+ INIT_KMALLOC_INFO(2097152, 2M),
+ INIT_KMALLOC_INFO(4194304, 4M),
+ INIT_KMALLOC_INFO(8388608, 8M),
+ INIT_KMALLOC_INFO(16777216, 16M),
+ INIT_KMALLOC_INFO(33554432, 32M),
+ INIT_KMALLOC_INFO(67108864, 64M)
};
/*
@@ -1208,36 +1238,14 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static const char *
-kmalloc_cache_name(const char *prefix, unsigned int size)
-{
-
- static const char units[3] = "\0kM";
- int idx = 0;
-
- while (size >= 1024 && (size % 1024 == 0)) {
- size /= 1024;
- idx++;
- }
-
- return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
-}
-
static void __init
-new_kmalloc_cache(int idx, int type, slab_flags_t flags)
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
- const char *name;
-
- if (type == KMALLOC_RECLAIM) {
+ if (type == KMALLOC_RECLAIM)
flags |= SLAB_RECLAIM_ACCOUNT;
- name = kmalloc_cache_name("kmalloc-rcl",
- kmalloc_info[idx].size);
- BUG_ON(!name);
- } else {
- name = kmalloc_info[idx].name;
- }
- kmalloc_caches[type][idx] = create_kmalloc_cache(name,
+ kmalloc_caches[type][idx] = create_kmalloc_cache(
+ kmalloc_info[idx].name[type],
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
}
@@ -1249,7 +1257,8 @@ new_kmalloc_cache(int idx, int type, slab_flags_t flags)
*/
void __init create_kmalloc_caches(slab_flags_t flags)
{
- int i, type;
+ int i;
+ enum kmalloc_cache_type type;
for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
@@ -1278,12 +1287,10 @@ void __init create_kmalloc_caches(slab_flags_t flags)
struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
- unsigned int size = kmalloc_size(i);
- const char *n = kmalloc_cache_name("dma-kmalloc", size);
-
- BUG_ON(!n);
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
- n, size, SLAB_CACHE_DMA | flags, 0, 0);
+ kmalloc_info[i].name[KMALLOC_DMA],
+ kmalloc_info[i].size,
+ SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
diff --git a/mm/slub.c b/mm/slub.c
index e72e802fc569..d11389710b12 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -93,9 +93,7 @@
* minimal so we rely on the page allocators per cpu caches for
* fast frees and allocs.
*
- * Overloading of page flags that are otherwise used for LRU management.
- *
- * PageActive The slab is frozen and exempt from list processing.
+ * page->frozen The slab is frozen and exempt from list processing.
* This means that the slab is dedicated to a purpose
* such as satisfying allocations for a specific
* processor. Objects may be freed in the slab while
@@ -111,7 +109,7 @@
* free objects in addition to the regular freelist
* that requires the slab lock.
*
- * PageError Slab requires special handling due to debug
+ * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
* options set. This moves slab handling out of
* the fast path and disables lockless freelists.
*/
@@ -736,6 +734,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
{
u8 *fault;
u8 *end;
+ u8 *addr = page_address(page);
metadata_access_enable();
fault = memchr_inv(start, value, bytes);
@@ -748,8 +747,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
end--;
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault[0], value);
+ pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault - addr,
+ fault[0], value);
print_trailer(s, page, object);
restore_bytes(s, what, value, fault, end);
@@ -844,7 +844,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
@@ -4383,31 +4384,26 @@ static int count_total(struct page *page)
#endif
#ifdef CONFIG_SLUB_DEBUG
-static int validate_slab(struct kmem_cache *s, struct page *page,
+static void validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
void *p;
void *addr = page_address(page);
- if (!check_slab(s, page) ||
- !on_freelist(s, page, NULL))
- return 0;
+ if (!check_slab(s, page) || !on_freelist(s, page, NULL))
+ return;
/* Now we know that a valid freelist exists */
bitmap_zero(map, page->objects);
get_map(s, page, map);
for_each_object(p, s, addr, page->objects) {
- if (test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, SLUB_RED_INACTIVE))
- return 0;
- }
+ u8 val = test_bit(slab_index(p, s, addr), map) ?
+ SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
- for_each_object(p, s, addr, page->objects)
- if (!test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, SLUB_RED_ACTIVE))
- return 0;
- return 1;
+ if (!check_object(s, page, p, val))
+ break;
+ }
}
static void validate_slab_slab(struct kmem_cache *s, struct page *page,
diff --git a/mm/sparse.c b/mm/sparse.c
index f6891c1992b1..b20ab7cdac86 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -458,8 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
if (map)
return map;
- map = memblock_alloc_try_nid(size,
- PAGE_SIZE, addr,
+ map = memblock_alloc_try_nid_raw(size, size, addr,
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
@@ -482,10 +481,13 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
{
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
- sparsemap_buf =
- memblock_alloc_try_nid_raw(size, PAGE_SIZE,
- addr,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ /*
+ * Pre-allocated buffer is mainly used by __populate_section_memmap
+ * and we want it to be properly aligned to the section size - this is
+ * especially the case for VMEMMAP which maps memmap to PMDs
+ */
+ sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
+ addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
sparsemap_buf_end = sparsemap_buf + size;
}
@@ -647,7 +649,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static struct page *populate_section_memmap(unsigned long pfn,
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
return __populate_section_memmap(pfn, nr_pages, nid, altmap);
@@ -669,7 +671,7 @@ static void free_map_bootmem(struct page *memmap)
vmemmap_free(start, end, NULL);
}
#else
-struct page *populate_section_memmap(unsigned long pfn,
+struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
struct page *page, *ret;
diff --git a/mm/swap.c b/mm/swap.c
index 38c3fa4308e2..5341ae93861f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -373,9 +373,16 @@ static void __lru_cache_activate_page(struct page *page)
void mark_page_accessed(struct page *page)
{
page = compound_head(page);
- if (!PageActive(page) && !PageUnevictable(page) &&
- PageReferenced(page)) {
+ if (!PageReferenced(page)) {
+ SetPageReferenced(page);
+ } else if (PageUnevictable(page)) {
+ /*
+ * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
+ * this list is never rotated or maintained, so marking an
+ * unevictable page accessed has no effect.
+ */
+ } else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
* activate_page_pvecs. Otherwise, assume the page is on a
@@ -389,8 +396,6 @@ void mark_page_accessed(struct page *page)
ClearPageReferenced(page);
if (page_is_file_cache(page))
workingset_activation(page);
- } else if (!PageReferenced(page)) {
- SetPageReferenced(page);
}
if (page_is_idle(page))
clear_page_idle(page);
@@ -708,9 +713,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
*/
void lru_add_drain_all(void)
{
+ static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
- int cpu;
+ int cpu, seq;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -719,7 +725,19 @@ void lru_add_drain_all(void)
if (WARN_ON(!mm_percpu_wq))
return;
+ seq = raw_read_seqcount_latch(&seqcount);
+
mutex_lock(&lock);
+
+ /*
+ * Piggyback on drain started and finished while we waited for lock:
+ * all pages pending at the time of our entry were drained from vectors.
+ */
+ if (__read_seqcount_retry(&seqcount, seq))
+ goto done;
+
+ raw_write_seqcount_latch(&seqcount);
+
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
@@ -740,6 +758,7 @@ void lru_add_drain_all(void)
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
+done:
mutex_unlock(&lock);
}
#else
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dab43523afdd..bb3261d45b6a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2887,6 +2887,13 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
return error;
+ /*
+ * Zoned block devices contain zones that have a sequential
+ * write only restriction. Hence zoned block devices are not
+ * suitable for swapping. Disallow them here.
+ */
+ if (blk_queue_is_zoned(p->bdev->bd_queue))
+ return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index c7ae74ce5ff3..1b0d7abad1d4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,36 @@
#include <asm/tlbflush.h>
#include "internal.h"
+static __always_inline
+struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ /*
+ * Make sure that the dst range is both valid and fully within a
+ * single existing vma.
+ */
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma)
+ return NULL;
+
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ return NULL;
+
+ /*
+ * Check the vma is registered in uffd, this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ return NULL;
+
+ return dst_vma;
+}
+
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -60,7 +90,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
/*
* The memory barrier inside __SetPageUptodate makes sure that
- * preceeding stores to the page contents become visible before
+ * preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
@@ -184,7 +214,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
- struct hstate *h;
unsigned long vma_hpagesize;
pgoff_t idx;
u32 hash;
@@ -221,20 +250,9 @@ retry:
*/
if (!dst_vma) {
err = -ENOENT;
- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
goto out_unlock;
- /*
- * Check the vma is registered in uffd, this is
- * required to enforce the VM_MAYWRITE check done at
- * uffd registration time.
- */
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
- goto out_unlock;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out_unlock;
err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
@@ -243,10 +261,6 @@ retry:
vm_shared = dst_vma->vm_flags & VM_SHARED;
}
- if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
- (len - copied) & (vma_hpagesize - 1)))
- goto out_unlock;
-
/*
* If not shared, ensure the dst_vma has a anon_vma.
*/
@@ -256,24 +270,21 @@ retry:
goto out_unlock;
}
- h = hstate_vma(dst_vma);
-
while (src_addr < src_start + len) {
pte_t dst_pteval;
BUG_ON(dst_addr >= dst_start + len);
- VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
* Serialize via hugetlb_fault_mutex
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+ dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out_unlock;
@@ -300,7 +311,8 @@ retry:
err = copy_huge_page_from_user(page,
(const void __user *)src_addr,
- pages_per_huge_page(h), true);
+ vma_hpagesize / PAGE_SIZE,
+ true);
if (unlikely(err)) {
err = -EFAULT;
goto out;
@@ -475,20 +487,9 @@ retry:
* both valid and fully within a single existing vma.
*/
err = -ENOENT;
- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
if (!dst_vma)
goto out_unlock;
- /*
- * Check the vma is registered in uffd, this is required to
- * enforce the VM_MAYWRITE check done at uffd registration
- * time.
- */
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
- goto out_unlock;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out_unlock;
err = -EINVAL;
/*
diff --git a/mm/util.c b/mm/util.c
index 3ad6db9a722e..988d11e6c17c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -271,7 +271,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
EXPORT_SYMBOL(memdup_user_nul);
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
struct vm_area_struct *next;
@@ -280,18 +280,28 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
next = prev->vm_next;
prev->vm_next = vma;
} else {
+ next = mm->mmap;
mm->mmap = vma;
- if (rb_parent)
- next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- else
- next = NULL;
}
vma->vm_next = next;
if (next)
next->vm_prev = vma;
}
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *prev, *next;
+
+ next = vma->vm_next;
+ prev = vma->vm_prev;
+ if (prev)
+ prev->vm_next = next;
+ else
+ mm->mmap = next;
+ if (next)
+ next->vm_prev = prev;
+}
+
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4a7d7459c4f9..4d3b3d60d893 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
@@ -682,7 +683,7 @@ insert_vmap_area_augment(struct vmap_area *va,
* free area is inserted. If VA has been merged, it is
* freed.
*/
-static __always_inline void
+static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
@@ -749,7 +750,10 @@ merge_or_add_vmap_area(struct vmap_area *va,
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
- return;
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
}
}
@@ -758,6 +762,8 @@ insert:
link_va(va, root, parent, link, head);
augment_tree_propagate_from(va);
}
+
+ return va;
}
static __always_inline bool
@@ -968,6 +974,19 @@ adjust_va_to_fit_type(struct vmap_area *va,
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
+ *
+ * Also we can hit this path in case of regular "vmap"
+ * allocations, if "this" current CPU was not preloaded.
+ * See the comment in alloc_vmap_area() for why. If so,
+ * GFP_NOWAIT is used instead to get an extra object for
+ * the split. That is rare and most of the time does not
+ * occur.
+ *
+ * If that allocation fails, the "overflow" path is
+ * triggered to purge lazily freed areas and free some
+ * memory, and then the "retry" path is triggered to try
+ * one more time. See alloc_vmap_area() for more
+ * details.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
@@ -1063,9 +1082,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
might_sleep();
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
- va = kmem_cache_alloc_node(vmap_area_cachep,
- gfp_mask & GFP_RECLAIM_MASK, node);
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1073,49 +1092,55 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
/*
- * Preload this CPU with one extra vmap_area object to ensure
- * that we have it available when fit type of free area is
- * NE_FIT_TYPE.
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
+ * does not guarantee that an allocation occurs on a CPU that
+ * is preloaded, instead we minimize the case when it is not.
+ * It can happen because of cpu migration, because there is a
+ * race until the below spinlock is taken.
*
* The preload is done in non-atomic context, thus it allows us
* to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure.
+ * low memory condition and high memory pressure. In rare case,
+ * if not preloaded, GFP_NOWAIT is used.
*
- * Even if it fails we do not really care about that. Just proceed
- * as it is. "overflow" path will refill the cache we allocate from.
+ * Set "pva" to NULL here, because of "retry" path.
*/
- preempt_disable();
- if (!__this_cpu_read(ne_fit_preload_node)) {
- preempt_enable();
- pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
- preempt_disable();
-
- if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
- if (pva)
- kmem_cache_free(vmap_area_cachep, pva);
- }
- }
+ pva = NULL;
- spin_lock(&vmap_area_lock);
- preempt_enable();
+ if (!this_cpu_read(ne_fit_preload_node))
+ /*
+ * Even if it fails we do not really care about that.
+ * Just proceed as it is. If needed "overflow" path
+ * will refill the cache we allocate from.
+ */
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(&free_vmap_area_lock);
+
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+ kmem_cache_free(vmap_area_cachep, pva);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+
if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -1125,7 +1150,6 @@ retry:
return va;
overflow:
- spin_unlock(&vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
@@ -1161,28 +1185,24 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
-static void __free_vmap_area(struct vmap_area *va)
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
{
/*
* Remove from the busy tree/list.
*/
+ spin_lock(&vmap_area_lock);
unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
/*
- * Merge VA with its neighbors, otherwise just add it.
+ * Insert/Merge it back to the free tree/list.
*/
- merge_or_add_vmap_area(va,
- &free_vmap_area_root, &free_vmap_area_list);
-}
-
-/*
- * Free a region of KVA allocated by alloc_vmap_area
- */
-static void free_vmap_area(struct vmap_area *va)
-{
- spin_lock(&vmap_area_lock);
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
}
/*
@@ -1275,24 +1295,30 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long orig_start = va->va_start;
+ unsigned long orig_end = va->va_end;
/*
* Finally insert or merge lazily-freed area. It is
* detached and there is no need to "unlink" it from
* anything.
*/
- merge_or_add_vmap_area(va,
- &free_vmap_area_root, &free_vmap_area_list);
+ va = merge_or_add_vmap_area(va, &free_vmap_area_root,
+ &free_vmap_area_list);
+
+ if (is_vmalloc_or_module_addr((void *)orig_start))
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
- cond_resched_lock(&vmap_area_lock);
+ cond_resched_lock(&free_vmap_area_lock);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
return true;
}
@@ -2014,15 +2040,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
{
- spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
+}
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ spin_lock(&vmap_area_lock);
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
spin_unlock(&vmap_area_lock);
}
@@ -2068,6 +2100,22 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * For KASAN, if we are in vmalloc space, we need to cover the shadow
+ * area with real memory. If we come here through VM_ALLOC, this is
+ * done by a higher level function that has access to the true size,
+ * which might not be a full page.
+ *
+ * We assume module space comes via VM_ALLOC path.
+ */
+ if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) {
+ if (kasan_populate_vmalloc(area->size, area)) {
+ unmap_vmap_area(va);
+ kfree(area);
+ return NULL;
+ }
+ }
+
return area;
}
@@ -2245,6 +2293,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+ if (area->flags & VM_KASAN)
+ kasan_poison_vmalloc(area->addr, area->size);
+
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
@@ -2440,7 +2491,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
+ if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
@@ -2497,6 +2548,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!addr)
return NULL;
+ if (is_vmalloc_or_module_addr(area->addr)) {
+ if (kasan_populate_vmalloc(real_size, area))
+ return NULL;
+ }
+
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
@@ -3282,7 +3338,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free;
}
retry:
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
@@ -3364,29 +3420,44 @@ retry:
va = vas[area];
va->va_start = start;
va->va_end = start + size;
-
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
/* insert all vm's */
- for (area = 0; area < nr_vms; area++)
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ spin_lock(&vmap_area_lock);
+ for (area = 0; area < nr_vms; area++) {
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
+ }
+ spin_unlock(&vmap_area_lock);
+
+ /* populate the shadow space outside of the lock */
+ for (area = 0; area < nr_vms; area++) {
+ /* assume success here */
+ kasan_populate_vmalloc(sizes[area], vms[area]);
+ }
kfree(vas);
return vms;
recovery:
- /* Remove previously inserted areas. */
+ /*
+ * Remove previously allocated areas. There is no
+ * need to remove these areas from the busy tree,
+ * because they are inserted only in the final step,
+ * once pcpu_get_vm_areas() has succeeded.
+ */
while (area--) {
- __free_vmap_area(vas[area]);
+ merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
vas[area] = NULL;
}
overflow:
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = true;
@@ -3437,9 +3508,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmap_purge_lock)
__acquires(&vmap_area_lock)
{
+ mutex_lock(&vmap_purge_lock);
spin_lock(&vmap_area_lock);
+
return seq_list_start(&vmap_area_list, *pos);
}
@@ -3449,8 +3523,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmap_purge_lock)
__releases(&vmap_area_lock)
{
+ mutex_unlock(&vmap_purge_lock);
spin_unlock(&vmap_area_lock);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee4eecc7e1c2..74e8edce83ca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -79,6 +79,13 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
+ /* Can active pages be deactivated as part of reclaim? */
+#define DEACTIVATE_ANON 1
+#define DEACTIVATE_FILE 2
+ unsigned int may_deactivate:2;
+ unsigned int force_deactivate:1;
+ unsigned int skipped_deactivate:1;
+
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
@@ -101,6 +108,12 @@ struct scan_control {
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
+ /* There is easily reclaimable cold cache in the current node */
+ unsigned int cache_trim_mode:1;
+
+ /* The file pages on the current node are dangerously low */
+ unsigned int file_is_tiny:1;
+
/* Allocation order */
s8 order;
@@ -239,13 +252,13 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
up_write(&shrinker_rwsem);
}
-static bool global_reclaim(struct scan_control *sc)
+static bool cgroup_reclaim(struct scan_control *sc)
{
- return !sc->target_mem_cgroup;
+ return sc->target_mem_cgroup;
}
/**
- * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
*
* The normal page dirty throttling mechanism in balance_dirty_pages() is
@@ -257,11 +270,9 @@ static bool global_reclaim(struct scan_control *sc)
* This function tests whether the vmscan currently in progress can assume
* that the normal dirty throttling mechanism is operational.
*/
-static bool sane_reclaim(struct scan_control *sc)
+static bool writeback_throttling_sane(struct scan_control *sc)
{
- struct mem_cgroup *memcg = sc->target_mem_cgroup;
-
- if (!memcg)
+ if (!cgroup_reclaim(sc))
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
@@ -269,29 +280,6 @@ static bool sane_reclaim(struct scan_control *sc)
#endif
return false;
}
-
-static void set_memcg_congestion(pg_data_t *pgdat,
- struct mem_cgroup *memcg,
- bool congested)
-{
- struct mem_cgroup_per_node *mn;
-
- if (!memcg)
- return;
-
- mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
- WRITE_ONCE(mn->congested, congested);
-}
-
-static bool memcg_congested(pg_data_t *pgdat,
- struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_node *mn;
-
- mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
- return READ_ONCE(mn->congested);
-
-}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
@@ -302,27 +290,15 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}
-static bool global_reclaim(struct scan_control *sc)
+static bool cgroup_reclaim(struct scan_control *sc)
{
- return true;
+ return false;
}
-static bool sane_reclaim(struct scan_control *sc)
+static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
}
-
-static inline void set_memcg_congestion(struct pglist_data *pgdat,
- struct mem_cgroup *memcg, bool congested)
-{
-}
-
-static inline bool memcg_congested(struct pglist_data *pgdat,
- struct mem_cgroup *memcg)
-{
- return false;
-
-}
#endif
/*
@@ -351,32 +327,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
*/
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
- unsigned long lru_size = 0;
+ unsigned long size = 0;
int zid;
- if (!mem_cgroup_disabled()) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
- } else
- lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
-
- for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
- unsigned long size;
if (!managed_zone(zone))
continue;
if (!mem_cgroup_disabled())
- size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
- size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
- NR_ZONE_LRU_BASE + lru);
- lru_size -= min(size, lru_size);
+ size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
}
-
- return lru_size;
-
+ return size;
}
/*
@@ -775,7 +740,7 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
-static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -823,8 +788,7 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping,
- struct scan_control *sc)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -860,7 +824,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_inode(mapping->host, sc))
+ if (!may_write_to_inode(mapping->host))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -899,7 +863,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* gets returned with a refcount of 0.
*/
static int __remove_mapping(struct address_space *mapping, struct page *page,
- bool reclaimed)
+ bool reclaimed, struct mem_cgroup *target_memcg)
{
unsigned long flags;
int refcount;
@@ -971,7 +935,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
- shadow = workingset_eviction(page);
+ shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
@@ -994,7 +958,7 @@ cannot_free:
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
- if (__remove_mapping(mapping, page, false)) {
+ if (__remove_mapping(mapping, page, false, NULL)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
@@ -1239,7 +1203,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto activate_locked;
/* Case 2 above */
- } else if (sane_reclaim(sc) ||
+ } else if (writeback_throttling_sane(sc) ||
!PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
@@ -1394,7 +1358,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(page, mapping, sc)) {
+ switch (pageout(page, mapping)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -1472,7 +1436,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
- } else if (!mapping || !__remove_mapping(mapping, page, true))
+ } else if (!mapping || !__remove_mapping(mapping, page, true,
+ sc->target_mem_cgroup))
goto keep_locked;
unlock_page(page);
@@ -1820,7 +1785,7 @@ int isolate_lru_page(struct page *page)
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
- * then get resheduled. When there are massive number of tasks doing page
+ * then get rescheduled. When there are massive number of tasks doing page
* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
@@ -1833,7 +1798,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
if (current_is_kswapd())
return 0;
- if (!sane_reclaim(sc))
+ if (!writeback_throttling_sane(sc))
return 0;
if (file) {
@@ -1983,7 +1948,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
reclaim_stat->recent_scanned[file] += nr_taken;
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
- if (global_reclaim(sc))
+ if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -1997,7 +1962,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
- if (global_reclaim(sc))
+ if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
@@ -2199,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list)
return nr_reclaimed;
}
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct scan_control *sc)
+{
+ if (is_active_lru(lru)) {
+ if (sc->may_deactivate & (1 << is_file_lru(lru)))
+ shrink_active_list(nr_to_scan, lruvec, sc, lru);
+ else
+ sc->skipped_deactivate = 1;
+ return 0;
+ }
+
+ return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+}
+
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
@@ -2227,64 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list)
* 1TB 101 10GB
* 10TB 320 32GB
*/
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool trace)
+static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
{
- enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- enum lru_list inactive_lru = file * LRU_FILE;
+ enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
unsigned long inactive, active;
unsigned long inactive_ratio;
- unsigned long refaults;
unsigned long gb;
- /*
- * If we don't have swap space, anonymous page deactivation
- * is pointless.
- */
- if (!file && !total_swap_pages)
- return false;
-
- inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
- active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
-
- /*
- * When refaults are being observed, it means a new workingset
- * is being established. Disable active list protection to get
- * rid of the stale workingset quickly.
- */
- refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
- if (file && lruvec->refaults != refaults) {
- inactive_ratio = 0;
- } else {
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
- }
+ inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
+ active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
- if (trace)
- trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
- lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
- lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
- inactive_ratio, file);
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
return inactive * inactive_ratio < active;
}
-static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct scan_control *sc)
-{
- if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
- shrink_active_list(nr_to_scan, lruvec, sc, lru);
- return 0;
- }
-
- return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
-}
-
enum scan_balance {
SCAN_EQUAL,
SCAN_FRACT,
@@ -2301,10 +2241,10 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *nr,
- unsigned long *lru_pages)
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long *nr)
{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
@@ -2329,7 +2269,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* using the memory controller's swap limit feature would be
* too expensive.
*/
- if (!global_reclaim(sc) && !swappiness) {
+ if (cgroup_reclaim(sc) && !swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2345,58 +2285,18 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
}
/*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
+ * If the system is almost out of file pages, force-scan anon.
*/
- if (global_reclaim(sc)) {
- unsigned long pgdatfile;
- unsigned long pgdatfree;
- int z;
- unsigned long total_high_wmark = 0;
-
- pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
- /*
- * Force SCAN_ANON if there are enough inactive
- * anonymous pages on the LRU in eligible zones.
- * Otherwise, the small LRU gets thrashed.
- */
- if (!inactive_list_is_low(lruvec, false, sc, false) &&
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
- >> sc->priority) {
- scan_balance = SCAN_ANON;
- goto out;
- }
- }
+ if (sc->file_is_tiny) {
+ scan_balance = SCAN_ANON;
+ goto out;
}
/*
- * If there is enough inactive page cache, i.e. if the size of the
- * inactive list is greater than that of the active list *and* the
- * inactive list actually has some pages to scan on this priority, we
- * do not reclaim anything from the anonymous working set right now.
- * Without the second condition we could end up never scanning an
- * lruvec even if it has plenty of old anonymous pages unless the
- * system is under heavy pressure.
+ * If there is enough inactive page cache, we do not reclaim
+ * anything from the anonymous working set right now.
*/
- if (!inactive_list_is_low(lruvec, true, sc, false) &&
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
+ if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2454,7 +2354,6 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- *lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
@@ -2549,18 +2448,12 @@ out:
BUG();
}
- *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
-/*
- * This is a basic per-node page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *lru_pages)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2570,7 +2463,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, memcg, sc, nr, lru_pages);
+ get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2586,7 +2479,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
@@ -2668,7 +2561,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2744,156 +2637,234 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return inactive_lru_pages > pages_for_compaction;
}
-static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
- return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
- (memcg && memcg_congested(pgdat, memcg));
+ struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ unsigned long reclaimed;
+ unsigned long scanned;
+
+ switch (mem_cgroup_protected(target_memcg, memcg)) {
+ case MEMCG_PROT_MIN:
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+ case MEMCG_PROT_LOW:
+ /*
+ * Soft protection.
+ * Respect the protection only as long as
+ * there is an unprotected supply
+ * of reclaimable memory from other cgroups.
+ */
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
+ continue;
+ }
+ memcg_memory_event(memcg, MEMCG_LOW);
+ break;
+ case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
+ break;
+ }
+
+ reclaimed = sc->nr_reclaimed;
+ scanned = sc->nr_scanned;
+
+ shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+ sc->priority);
+
+ /* Record the group's reclaim efficiency */
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
+ struct lruvec *target_lruvec;
bool reclaimable = false;
+ unsigned long file;
- do {
- struct mem_cgroup *root = sc->target_mem_cgroup;
- unsigned long node_lru_pages = 0;
- struct mem_cgroup *memcg;
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
- memset(&sc->nr, 0, sizeof(sc->nr));
+again:
+ memset(&sc->nr, 0, sizeof(sc->nr));
- nr_reclaimed = sc->nr_reclaimed;
- nr_scanned = sc->nr_scanned;
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, NULL);
- do {
- unsigned long lru_pages;
- unsigned long reclaimed;
- unsigned long scanned;
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
- switch (mem_cgroup_protected(root, memcg)) {
- case MEMCG_PROT_MIN:
- /*
- * Hard protection.
- * If there is no reclaimable memory, OOM.
- */
- continue;
- case MEMCG_PROT_LOW:
- /*
- * Soft protection.
- * Respect the protection only as long as
- * there is an unprotected supply
- * of reclaimable memory from other cgroups.
- */
- if (!sc->memcg_low_reclaim) {
- sc->memcg_low_skipped = 1;
- continue;
- }
- memcg_memory_event(memcg, MEMCG_LOW);
- break;
- case MEMCG_PROT_NONE:
- /*
- * All protection thresholds breached. We may
- * still choose to vary the scan pressure
- * applied based on by how much the cgroup in
- * question has exceeded its protection
- * thresholds (see get_scan_count).
- */
- break;
- }
+ if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
- reclaimed = sc->nr_reclaimed;
- scanned = sc->nr_scanned;
- shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
- node_lru_pages += lru_pages;
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE);
+ if (refaults != target_lruvec->refaults ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
- shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
- sc->priority);
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
- /* Record the group's reclaim efficiency */
- vmpressure(sc->gfp_mask, memcg, false,
- sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
- } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+ if (!managed_zone(zone))
+ continue;
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
+ total_high_wmark += high_wmark_pages(zone);
}
- /* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
- if (sc->nr_reclaimed - nr_reclaimed)
- reclaimable = true;
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
- if (current_is_kswapd()) {
- /*
- * If reclaim is isolating dirty pages under writeback,
- * it implies that the long-lived page allocation rate
- * is exceeding the page laundering rate. Either the
- * global limits are not being effective at throttling
- * processes due to the page distribution throughout
- * zones or there is heavy usage of a slow backing
- * device. The only option is to throttle from reclaim
- * context which is not ideal as there is no guarantee
- * the dirtying process is throttled in the same way
- * balance_dirty_pages() manages.
- *
- * Once a node is flagged PGDAT_WRITEBACK, kswapd will
- * count the number of pages under pages flagged for
- * immediate reclaim and stall if any are encountered
- * in the nr_immediate check below.
- */
- if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
- set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+ shrink_node_memcgs(pgdat, sc);
- /*
- * Tag a node as congested if all the dirty pages
- * scanned were backed by a congested BDI and
- * wait_iff_congested will stall.
- */
- if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
- set_bit(PGDAT_CONGESTED, &pgdat->flags);
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
- /* Allow kswapd to start writing pages during reclaim.*/
- if (sc->nr.unqueued_dirty == sc->nr.file_taken)
- set_bit(PGDAT_DIRTY, &pgdat->flags);
+ /* Record the subtree's reclaim efficiency */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
- /*
- * If kswapd scans pages marked marked for immediate
- * reclaim and under writeback (nr_immediate), it
- * implies that pages are cycling through the LRU
- * faster than they are written so also forcibly stall.
- */
- if (sc->nr.immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
+ if (sc->nr_reclaimed - nr_reclaimed)
+ reclaimable = true;
+ if (current_is_kswapd()) {
/*
- * Legacy memcg will stall in page writeback so avoid forcibly
- * stalling in wait_iff_congested().
+ * If reclaim is isolating dirty pages under writeback,
+ * it implies that the long-lived page allocation rate
+ * is exceeding the page laundering rate. Either the
+ * global limits are not being effective at throttling
+ * processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing
+ * device. The only option is to throttle from reclaim
+ * context which is not ideal as there is no guarantee
+ * the dirtying process is throttled in the same way
+ * balance_dirty_pages() manages.
+ *
+ * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+ * count the number of pages under writeback flagged for
+ * immediate reclaim and stall if any are encountered
+ * in the nr_immediate check below.
*/
- if (!global_reclaim(sc) && sane_reclaim(sc) &&
- sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
- set_memcg_congestion(pgdat, root, true);
+ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+ /* Allow kswapd to start writing pages during reclaim.*/
+ if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
- * Stall direct reclaim for IO completions if underlying BDIs
- * and node is congested. Allow kswapd to continue until it
- * starts encountering unqueued dirty pages or cycling through
- * the LRU too quickly.
+ * If kswapd scans pages marked for immediate
+ * reclaim and under writeback (nr_immediate), it
+ * implies that pages are cycling through the LRU
+ * faster than they are written so also forcibly stall.
*/
- if (!sc->hibernation_mode && !current_is_kswapd() &&
- current_may_throttle() && pgdat_memcg_congested(pgdat, root))
- wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+ if (sc->nr.immediate)
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
- } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc));
+ /*
+ * Tag a node/memcg as congested if all the dirty pages
+ * scanned were backed by a congested BDI and
+ * wait_iff_congested will stall.
+ *
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling in wait_iff_congested().
+ */
+ if ((current_is_kswapd() ||
+ (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
+ sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+ set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
+
+ /*
+ * Stall direct reclaim for IO completions if underlying BDIs
+ * and node is congested. Allow kswapd to continue until it
+ * starts encountering unqueued dirty pages or cycling through
+ * the LRU too quickly.
+ */
+ if (!current_is_kswapd() && current_may_throttle() &&
+ !sc->hibernation_mode &&
+ test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
+ wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
+ if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
+ sc))
+ goto again;
/*
* Kswapd gives up on balancing particular nodes after too
@@ -2973,7 +2944,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* Take care memory controller reclaiming has small influence
* to global LRU.
*/
- if (global_reclaim(sc)) {
+ if (!cgroup_reclaim(sc)) {
if (!cpuset_zone_allowed(zone,
GFP_KERNEL | __GFP_HARDWALL))
continue;
@@ -3032,19 +3003,14 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->gfp_mask = orig_mask;
}
-static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
- do {
- unsigned long refaults;
- struct lruvec *lruvec;
+ struct lruvec *target_lruvec;
+ unsigned long refaults;
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
- refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
- lruvec->refaults = refaults;
- } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+ target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
+ target_lruvec->refaults = refaults;
}
/*
@@ -3073,7 +3039,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
retry:
delayacct_freepages_start();
- if (global_reclaim(sc))
+ if (!cgroup_reclaim(sc))
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
@@ -3102,8 +3068,16 @@ retry:
if (zone->zone_pgdat == last_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
+
snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
- set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
+
+ if (cgroup_reclaim(sc)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
+ zone->zone_pgdat);
+ clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ }
}
delayacct_freepages_end();
@@ -3115,9 +3089,27 @@ retry:
if (sc->compaction_ready)
return 1;
+ /*
+ * We make inactive:active ratio decisions based on the node's
+ * composition of memory, but a restrictive reclaim_idx or a
+ * memory.low cgroup setting can exempt large amounts of
+ * memory from reclaim. Neither of which is very common, so
+ * instead of doing costly eligibility calculations of the
+ * entire cgroup subtree up front, we assume the estimates are
+ * good, and retry with forcible deactivation if that fails.
+ */
+ if (sc->skipped_deactivate) {
+ sc->priority = initial_priority;
+ sc->force_deactivate = 1;
+ sc->skipped_deactivate = 0;
+ goto retry;
+ }
+
/* Untapped cgroup reserves? Don't OOM, retry. */
if (sc->memcg_low_skipped) {
sc->priority = initial_priority;
+ sc->force_deactivate = 0;
+ sc->skipped_deactivate = 0;
sc->memcg_low_reclaim = 1;
sc->memcg_low_skipped = 0;
goto retry;
@@ -3309,6 +3301,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
pg_data_t *pgdat,
unsigned long *nr_scanned)
{
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.target_mem_cgroup = memcg,
@@ -3317,7 +3310,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
};
- unsigned long lru_pages;
WARN_ON_ONCE(!current->reclaim_state);
@@ -3334,7 +3326,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+ shrink_lruvec(lruvec, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -3348,10 +3340,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
gfp_t gfp_mask,
bool may_swap)
{
- struct zonelist *zonelist;
unsigned long nr_reclaimed;
unsigned long pflags;
- int nid;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3364,16 +3354,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = may_swap,
};
-
- set_task_reclaim_state(current, &sc.reclaim_state);
/*
- * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
- * take care of from where we get pages. So the node where we start the
- * scan does not need to be the current node.
+ * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
+ * equal pressure on all the nodes. This is based on the assumption that
+ * the reclaim does not bail out early.
*/
- nid = mem_cgroup_select_victim_node(memcg);
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
- zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
+ set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
@@ -3396,18 +3384,20 @@ static void age_active_anon(struct pglist_data *pgdat,
struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
if (!total_swap_pages)
return;
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ return;
+
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
-
- if (inactive_list_is_low(lruvec, false, sc, true))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
-
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
}
@@ -3475,7 +3465,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
/* Clear pgdat state for congested, dirty or under writeback. */
static void clear_pgdat_congested(pg_data_t *pgdat)
{
- clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
diff --git a/mm/workingset.c b/mm/workingset.c
index c963831d354f..474186b76ced 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -213,28 +213,53 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*workingsetp = workingset;
}
+static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ /*
+ * Reclaiming a cgroup means reclaiming all its children in a
+ * round-robin fashion. That means that each cgroup has an LRU
+ * order that is composed of the LRU orders of its child
+ * cgroups; and every page has an LRU position not just in the
+ * cgroup that owns it, but in all of that group's ancestors.
+ *
+ * So when the physical inactive list of a leaf cgroup ages,
+ * the virtual inactive lists of all its parents, including
+ * the root cgroup's, age as well.
+ */
+ do {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ atomic_long_inc(&lruvec->inactive_age);
+ } while (memcg && (memcg = parent_mem_cgroup(memcg)));
+}
+
/**
* workingset_eviction - note the eviction of a page from memory
+ * @target_memcg: the cgroup that is causing the reclaim
* @page: the page being evicted
*
* Returns a shadow entry to be stored in @page->mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
-void *workingset_eviction(struct page *page)
+void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
{
struct pglist_data *pgdat = page_pgdat(page);
- struct mem_cgroup *memcg = page_memcg(page);
- int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
+ int memcgid;
/* Page is fully exclusive and pins page->mem_cgroup */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
- eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ advance_inactive_age(page_memcg(page), pgdat);
+
+ lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ /* XXX: target_memcg can be NULL, go through lruvec */
+ memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ eviction = atomic_long_read(&lruvec->inactive_age);
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -244,10 +269,13 @@ void *workingset_eviction(struct page *page)
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node it was allocated in.
+ * evicted page in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
*/
void workingset_refault(struct page *page, void *shadow)
{
+ struct mem_cgroup *eviction_memcg;
+ struct lruvec *eviction_lruvec;
unsigned long refault_distance;
struct pglist_data *pgdat;
unsigned long active_file;
@@ -277,12 +305,12 @@ void workingset_refault(struct page *page, void *shadow)
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
- memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg)
+ eviction_memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !eviction_memcg)
goto out;
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
- refault = atomic_long_read(&lruvec->inactive_age);
- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
+ eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+ refault = atomic_long_read(&eviction_lruvec->inactive_age);
+ active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
/*
* Calculate the refault distance
@@ -302,6 +330,17 @@ void workingset_refault(struct page *page, void *shadow)
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
+ /*
+ * The activation decision for this page is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during page reclaim is being determined.
+ *
+ * However, the cgroup that will own the page is the one that
+ * is actually experiencing the refault event.
+ */
+ memcg = page_memcg(page);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
/*
@@ -313,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow)
goto out;
SetPageActive(page);
- atomic_long_inc(&lruvec->inactive_age);
+ advance_inactive_age(memcg, pgdat);
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
/* Page was active prior to eviction */
@@ -332,7 +371,6 @@ out:
void workingset_activation(struct page *page)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
/*
@@ -345,8 +383,7 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
- atomic_long_inc(&lruvec->inactive_age);
+ advance_inactive_age(memcg, page_pgdat(page));
out:
rcu_read_unlock();
}
@@ -426,7 +463,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;
- lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
NR_LRU_BASE + i);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 6d3d3f698ebb..43754d8ebce8 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/rwlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
@@ -90,6 +91,7 @@ struct z3fold_buddy_slots {
*/
unsigned long slot[BUDDY_MASK + 1];
unsigned long pool; /* back link + flags */
+ rwlock_t lock;
};
#define HANDLE_FLAG_MASK (0x03)
@@ -124,6 +126,7 @@ struct z3fold_header {
unsigned short start_middle;
unsigned short first_num:2;
unsigned short mapped_count:2;
+ unsigned short foreign_handles:2;
};
/**
@@ -178,6 +181,19 @@ enum z3fold_page_flags {
PAGE_CLAIMED, /* by either reclaim or free */
};
+/*
+ * handle flags, go under HANDLE_FLAG_MASK
+ */
+enum z3fold_handle_flags {
+ HANDLES_ORPHANED = 0,
+};
+
+/*
+ * Forward declarations
+ */
+static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
+static void compact_page_work(struct work_struct *w);
+
/*****************
* Helpers
*****************/
@@ -191,8 +207,6 @@ static int size_to_chunks(size_t size)
#define for_each_unbuddied_list(_iter, _begin) \
for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-static void compact_page_work(struct work_struct *w);
-
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
@@ -204,6 +218,7 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
if (slots) {
memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
+ rwlock_init(&slots->lock);
}
return slots;
@@ -219,25 +234,110 @@ static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
}
+/* Lock a z3fold page */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+ spin_lock(&zhdr->page_lock);
+}
+
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+ return spin_trylock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+ spin_unlock(&zhdr->page_lock);
+}
+
+
+static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
+ bool lock)
+{
+ struct z3fold_buddy_slots *slots;
+ struct z3fold_header *zhdr;
+ int locked = 0;
+
+ if (!(handle & (1 << PAGE_HEADLESS))) {
+ slots = handle_to_slots(handle);
+ do {
+ unsigned long addr;
+
+ read_lock(&slots->lock);
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ if (lock)
+ locked = z3fold_page_trylock(zhdr);
+ read_unlock(&slots->lock);
+ if (locked)
+ break;
+ cpu_relax();
+ } while (lock);
+ } else {
+ zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
+ }
+
+ return zhdr;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
+{
+ return __get_z3fold_header(h, false);
+}
+
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long h)
+{
+ return __get_z3fold_header(h, true);
+}
+
+static inline void put_z3fold_header(struct z3fold_header *zhdr)
+{
+ struct page *page = virt_to_page(zhdr);
+
+ if (!test_bit(PAGE_HEADLESS, &page->private))
+ z3fold_page_unlock(zhdr);
+}
+
static inline void free_handle(unsigned long handle)
{
struct z3fold_buddy_slots *slots;
+ struct z3fold_header *zhdr;
int i;
bool is_free;
if (handle & (1 << PAGE_HEADLESS))
return;
- WARN_ON(*(unsigned long *)handle == 0);
- *(unsigned long *)handle = 0;
+ if (WARN_ON(*(unsigned long *)handle == 0))
+ return;
+
+ zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
+ write_lock(&slots->lock);
+ *(unsigned long *)handle = 0;
+ write_unlock(&slots->lock);
+ if (zhdr->slots == slots)
+ return; /* simple case, nothing else to do */
+
+ /* we are freeing a foreign handle if we are here */
+ zhdr->foreign_handles--;
is_free = true;
+ read_lock(&slots->lock);
+ if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
+ read_unlock(&slots->lock);
+ return;
+ }
for (i = 0; i <= BUDDY_MASK; i++) {
if (slots->slot[i]) {
is_free = false;
break;
}
}
+ read_unlock(&slots->lock);
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);
@@ -322,6 +422,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
zhdr->first_num = 0;
zhdr->start_middle = 0;
zhdr->cpu = -1;
+ zhdr->foreign_handles = 0;
zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
@@ -341,24 +442,6 @@ static void free_z3fold_page(struct page *page, bool headless)
__free_page(page);
}
-/* Lock a z3fold page */
-static inline void z3fold_page_lock(struct z3fold_header *zhdr)
-{
- spin_lock(&zhdr->page_lock);
-}
-
-/* Try to lock a z3fold page */
-static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
-{
- return spin_trylock(&zhdr->page_lock);
-}
-
-/* Unlock a z3fold page */
-static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
-{
- spin_unlock(&zhdr->page_lock);
-}
-
/* Helper function to build the index */
static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
{
@@ -389,7 +472,9 @@ static unsigned long __encode_handle(struct z3fold_header *zhdr,
if (bud == LAST)
h |= (zhdr->last_chunks << BUDDY_SHIFT);
+ write_lock(&slots->lock);
slots->slot[idx] = h;
+ write_unlock(&slots->lock);
return (unsigned long)&slots->slot[idx];
}
@@ -398,22 +483,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
return __encode_handle(zhdr, zhdr->slots, bud);
}
-/* Returns the z3fold page where a given handle is stored */
-static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
-{
- unsigned long addr = h;
-
- if (!(addr & (1 << PAGE_HEADLESS)))
- addr = *(unsigned long *)h;
-
- return (struct z3fold_header *)(addr & PAGE_MASK);
-}
-
/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
- unsigned long addr = *(unsigned long *)handle;
+ struct z3fold_buddy_slots *slots = handle_to_slots(handle);
+ unsigned long addr;
+ read_lock(&slots->lock);
+ addr = *(unsigned long *)handle;
+ read_unlock(&slots->lock);
return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
@@ -425,10 +503,13 @@ static unsigned short handle_to_chunks(unsigned long handle)
static enum buddy handle_to_buddy(unsigned long handle)
{
struct z3fold_header *zhdr;
+ struct z3fold_buddy_slots *slots = handle_to_slots(handle);
unsigned long addr;
+ read_lock(&slots->lock);
WARN_ON(handle & (1 << PAGE_HEADLESS));
addr = *(unsigned long *)handle;
+ read_unlock(&slots->lock);
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
return (addr - zhdr->first_num) & BUDDY_MASK;
}
@@ -442,6 +523,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ bool is_free = true;
+ int i;
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
@@ -450,8 +533,25 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
if (!list_empty(&page->lru))
list_del_init(&page->lru);
spin_unlock(&pool->lock);
+
+ /* If there are no foreign handles, free the handles array */
+ read_lock(&zhdr->slots->lock);
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (zhdr->slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+ if (!is_free)
+ set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
+ read_unlock(&zhdr->slots->lock);
+
+ if (is_free)
+ kmem_cache_free(pool->c_handle, zhdr->slots);
+
if (locked)
z3fold_page_unlock(zhdr);
+
spin_lock(&pool->stale_lock);
list_add(&zhdr->buddy, &pool->stale);
queue_work(pool->release_wq, &pool->work);
@@ -479,6 +579,7 @@ static void release_z3fold_page_locked_list(struct kref *ref)
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+
spin_lock(&pool->lock);
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
@@ -559,6 +660,119 @@ static inline void *mchunk_memmove(struct z3fold_header *zhdr,
zhdr->middle_chunks << CHUNK_SHIFT);
}
+static inline bool buddy_single(struct z3fold_header *zhdr)
+{
+ return !((zhdr->first_chunks && zhdr->middle_chunks) ||
+ (zhdr->first_chunks && zhdr->last_chunks) ||
+ (zhdr->middle_chunks && zhdr->last_chunks));
+}
+
+static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
+{
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ void *p = zhdr;
+ unsigned long old_handle = 0;
+ size_t sz = 0;
+ struct z3fold_header *new_zhdr = NULL;
+ int first_idx = __idx(zhdr, FIRST);
+ int middle_idx = __idx(zhdr, MIDDLE);
+ int last_idx = __idx(zhdr, LAST);
+ unsigned short *moved_chunks = NULL;
+
+ /*
+ * No need to protect slots here -- all the slots are "local" and
+ * the page lock is already taken
+ */
+ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
+ p += ZHDR_SIZE_ALIGNED;
+ sz = zhdr->first_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
+ moved_chunks = &zhdr->first_chunks;
+ } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
+ p += zhdr->start_middle << CHUNK_SHIFT;
+ sz = zhdr->middle_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
+ moved_chunks = &zhdr->middle_chunks;
+ } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
+ p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ sz = zhdr->last_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
+ moved_chunks = &zhdr->last_chunks;
+ }
+
+ if (sz > 0) {
+ enum buddy new_bud = HEADLESS;
+ short chunks = size_to_chunks(sz);
+ void *q;
+
+ new_zhdr = __z3fold_alloc(pool, sz, false);
+ if (!new_zhdr)
+ return NULL;
+
+ if (WARN_ON(new_zhdr == zhdr))
+ goto out_fail;
+
+ if (new_zhdr->first_chunks == 0) {
+ if (new_zhdr->middle_chunks != 0 &&
+ chunks >= new_zhdr->start_middle) {
+ new_bud = LAST;
+ } else {
+ new_bud = FIRST;
+ }
+ } else if (new_zhdr->last_chunks == 0) {
+ new_bud = LAST;
+ } else if (new_zhdr->middle_chunks == 0) {
+ new_bud = MIDDLE;
+ }
+ q = new_zhdr;
+ switch (new_bud) {
+ case FIRST:
+ new_zhdr->first_chunks = chunks;
+ q += ZHDR_SIZE_ALIGNED;
+ break;
+ case MIDDLE:
+ new_zhdr->middle_chunks = chunks;
+ new_zhdr->start_middle =
+ new_zhdr->first_chunks + ZHDR_CHUNKS;
+ q += new_zhdr->start_middle << CHUNK_SHIFT;
+ break;
+ case LAST:
+ new_zhdr->last_chunks = chunks;
+ q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
+ break;
+ default:
+ goto out_fail;
+ }
+ new_zhdr->foreign_handles++;
+ memcpy(q, p, sz);
+ write_lock(&zhdr->slots->lock);
+ *(unsigned long *)old_handle = (unsigned long)new_zhdr +
+ __idx(new_zhdr, new_bud);
+ if (new_bud == LAST)
+ *(unsigned long *)old_handle |=
+ (new_zhdr->last_chunks << BUDDY_SHIFT);
+ write_unlock(&zhdr->slots->lock);
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
+
+ *moved_chunks = 0;
+ }
+
+ return new_zhdr;
+
+out_fail:
+ if (new_zhdr) {
+ if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else {
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
+ }
+ }
+ return NULL;
+
+}
+
#define BIG_CHUNK_GAP 3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
@@ -638,6 +852,15 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}
+ if (!zhdr->foreign_handles && buddy_single(zhdr) &&
+ zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+
z3fold_compact_page(zhdr);
add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
@@ -690,7 +913,8 @@ lookup:
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
put_cpu_ptr(pool->unbuddied);
@@ -734,7 +958,8 @@ lookup:
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
if (can_sleep)
@@ -1000,7 +1225,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
enum buddy bud;
bool page_claimed;
- zhdr = handle_to_z3fold_header(handle);
+ zhdr = get_z3fold_header(handle);
page = virt_to_page(zhdr);
page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
@@ -1014,6 +1239,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
+ put_z3fold_header(zhdr);
free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
}
@@ -1021,7 +1247,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
/* Non-headless case */
- z3fold_page_lock(zhdr);
bud = handle_to_buddy(handle);
switch (bud) {
@@ -1037,11 +1262,13 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
default:
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
- free_handle(handle);
+ if (!page_claimed)
+ free_handle(handle);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -1053,7 +1280,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
return;
}
@@ -1063,14 +1290,14 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_unlock(&pool->lock);
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
- do_compact_page(zhdr, true);
clear_bit(PAGE_CLAIMED, &page->private);
+ do_compact_page(zhdr, true);
return;
}
kref_get(&zhdr->refcount);
- queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
+ queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ put_z3fold_header(zhdr);
}
/**
@@ -1111,11 +1338,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
*/
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
- int i, ret = 0;
+ int i, ret = -1;
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
struct list_head *pos;
- struct z3fold_buddy_slots slots;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
@@ -1153,6 +1379,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
zhdr = NULL;
continue; /* can't evict at this point */
}
+ if (zhdr->foreign_handles) {
+ clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ continue; /* can't evict such page */
+ }
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
@@ -1176,39 +1408,38 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
last_handle = 0;
middle_handle = 0;
if (zhdr->first_chunks)
- first_handle = __encode_handle(zhdr, &slots,
- FIRST);
+ first_handle = encode_handle(zhdr, FIRST);
if (zhdr->middle_chunks)
- middle_handle = __encode_handle(zhdr, &slots,
- MIDDLE);
+ middle_handle = encode_handle(zhdr, MIDDLE);
if (zhdr->last_chunks)
- last_handle = __encode_handle(zhdr, &slots,
- LAST);
+ last_handle = encode_handle(zhdr, LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
*/
z3fold_page_unlock(zhdr);
} else {
- first_handle = __encode_handle(zhdr, &slots, HEADLESS);
+ first_handle = encode_handle(zhdr, HEADLESS);
last_handle = middle_handle = 0;
}
-
/* Issue the eviction callback(s) */
if (middle_handle) {
ret = pool->ops->evict(pool, middle_handle);
if (ret)
goto next;
+ free_handle(middle_handle);
}
if (first_handle) {
ret = pool->ops->evict(pool, first_handle);
if (ret)
goto next;
+ free_handle(first_handle);
}
if (last_handle) {
ret = pool->ops->evict(pool, last_handle);
if (ret)
goto next;
+ free_handle(last_handle);
}
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
@@ -1264,14 +1495,13 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
void *addr;
enum buddy buddy;
- zhdr = handle_to_z3fold_header(handle);
+ zhdr = get_z3fold_header(handle);
addr = zhdr;
page = virt_to_page(zhdr);
if (test_bit(PAGE_HEADLESS, &page->private))
goto out;
- z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
switch (buddy) {
case FIRST:
@@ -1293,8 +1523,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
if (addr)
zhdr->mapped_count++;
- z3fold_page_unlock(zhdr);
out:
+ put_z3fold_header(zhdr);
return addr;
}
@@ -1309,18 +1539,17 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
struct page *page;
enum buddy buddy;
- zhdr = handle_to_z3fold_header(handle);
+ zhdr = get_z3fold_header(handle);
page = virt_to_page(zhdr);
if (test_bit(PAGE_HEADLESS, &page->private))
return;
- z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
zhdr->mapped_count--;
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
}
/**
@@ -1352,19 +1581,21 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
test_bit(PAGE_STALE, &page->private))
goto out;
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
+ goto out;
+
pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ if (!list_empty(&page->lru))
+ list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
+
+ kref_get(&zhdr->refcount);
+ z3fold_page_unlock(zhdr);
+ return true;
- if (zhdr->mapped_count == 0) {
- kref_get(&zhdr->refcount);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- spin_lock(&pool->lock);
- if (!list_empty(&page->lru))
- list_del(&page->lru);
- spin_unlock(&pool->lock);
- z3fold_page_unlock(zhdr);
- return true;
- }
out:
z3fold_page_unlock(zhdr);
return false;
@@ -1387,7 +1618,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
if (!z3fold_page_trylock(zhdr)) {
return -EAGAIN;
}
- if (zhdr->mapped_count != 0) {
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
z3fold_page_unlock(zhdr);
return -EBUSY;
}
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index de75b9feaaed..672b5931bc8d 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -87,6 +87,7 @@ algorith||algorithm
algorithmical||algorithmically
algoritm||algorithm
algoritms||algorithms
+algorithmn||algorithm
algorrithm||algorithm
algorritm||algorithm
aligment||alignment
@@ -109,6 +110,7 @@ alredy||already
altough||although
alue||value
ambigious||ambiguous
+ambigous||ambiguous
amoung||among
amout||amount
amplifer||amplifier
@@ -179,6 +181,7 @@ attepmpt||attempt
attnetion||attention
attruibutes||attributes
authentification||authentication
+authenicated||authenticated
automaticaly||automatically
automaticly||automatically
automatize||automate
@@ -286,6 +289,7 @@ claread||cleared
clared||cleared
closeing||closing
clustred||clustered
+cnfiguration||configuration
coexistance||coexistence
colescing||coalescing
collapsable||collapsible
@@ -325,9 +329,11 @@ comression||compression
comunication||communication
conbination||combination
conditionaly||conditionally
+conditon||condition
conected||connected
conector||connector
connecetd||connected
+configration||configuration
configuartion||configuration
configuation||configuration
configued||configured
@@ -347,6 +353,7 @@ containts||contains
contaisn||contains
contant||contact
contence||contents
+contiguos||contiguous
continious||continuous
continous||continuous
continously||continuously
@@ -380,6 +387,7 @@ cylic||cyclic
dafault||default
deafult||default
deamon||daemon
+debouce||debounce
decompres||decompress
decsribed||described
decription||description
@@ -448,6 +456,7 @@ diffrent||different
differenciate||differentiate
diffrentiate||differentiate
difinition||definition
+digial||digital
dimention||dimension
dimesions||dimensions
dispalying||displaying
@@ -489,6 +498,7 @@ droput||dropout
druing||during
dynmaic||dynamic
eanable||enable
+eanble||enable
easilly||easily
ecspecially||especially
edditable||editable
@@ -502,6 +512,7 @@ elementry||elementary
eletronic||electronic
embeded||embedded
enabledi||enabled
+enbale||enable
enble||enable
enchanced||enhanced
encorporating||incorporating
@@ -536,6 +547,7 @@ excellant||excellent
execeeded||exceeded
execeeds||exceeds
exeed||exceed
+exeuction||execution
existance||existence
existant||existent
exixt||exist
@@ -601,10 +613,12 @@ frambuffer||framebuffer
framming||framing
framwork||framework
frequncy||frequency
+frequancy||frequency
frome||from
fucntion||function
fuction||function
fuctions||functions
+fullill||fulfill
funcation||function
funcion||function
functionallity||functionality
@@ -642,6 +656,7 @@ happend||happened
harware||hardware
heirarchically||hierarchically
helpfull||helpful
+hexdecimal||hexadecimal
hybernate||hibernate
hierachy||hierarchy
hierarchie||hierarchy
@@ -709,12 +724,14 @@ initalize||initialize
initation||initiation
initators||initiators
initialiazation||initialization
+initializationg||initialization
initializiation||initialization
initialze||initialize
initialzed||initialized
initialzing||initializing
initilization||initialization
initilize||initialize
+initliaze||initialize
inofficial||unofficial
inrerface||interface
insititute||institute
@@ -779,6 +796,7 @@ itertation||iteration
itslef||itself
jave||java
jeffies||jiffies
+jumpimng||jumping
juse||just
jus||just
kown||known
@@ -839,6 +857,7 @@ messags||messages
messgaes||messages
messsage||message
messsages||messages
+metdata||metadata
micropone||microphone
microprocesspr||microprocessor
migrateable||migratable
@@ -857,6 +876,7 @@ mismactch||mismatch
missign||missing
missmanaged||mismanaged
missmatch||mismatch
+misssing||missing
miximum||maximum
mmnemonic||mnemonic
mnay||many
@@ -912,6 +932,7 @@ occured||occurred
occuring||occurring
offser||offset
offet||offset
+offlaod||offload
offloded||offloaded
offseting||offsetting
omited||omitted
@@ -993,6 +1014,7 @@ poiter||pointer
posible||possible
positon||position
possibilites||possibilities
+potocol||protocol
powerfull||powerful
pramater||parameter
preamle||preamble
@@ -1061,11 +1083,13 @@ psychadelic||psychedelic
pwoer||power
queing||queuing
quering||querying
+queus||queues
randomally||randomly
raoming||roaming
reasearcher||researcher
reasearchers||researchers
reasearch||research
+receieve||receive
recepient||recipient
recevied||received
receving||receiving
@@ -1166,6 +1190,7 @@ scaleing||scaling
scaned||scanned
scaning||scanning
scarch||search
+schdule||schedule
seach||search
searchs||searches
secquence||sequence
@@ -1308,6 +1333,7 @@ taskelt||tasklet
teh||the
temorary||temporary
temproarily||temporarily
+temperture||temperature
thead||thread
therfore||therefore
thier||their
@@ -1354,6 +1380,7 @@ uknown||unknown
usupported||unsupported
uncommited||uncommitted
unconditionaly||unconditionally
+undeflow||underflow
underun||underrun
unecessary||unnecessary
unexecpted||unexpected
@@ -1414,6 +1441,7 @@ varible||variable
varient||variant
vaule||value
verbse||verbose
+veify||verify
verisons||versions
verison||version
verson||version
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index c67d32eeb668..334a7eea2004 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -290,6 +290,40 @@ static void mfd_assert_read_shared(int fd)
munmap(p, mfd_def_size);
}
+static void mfd_assert_fork_private_write(int fd)
+{
+ int *p;
+ pid_t pid;
+
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ p[0] = 22;
+
+ pid = fork();
+ if (pid == 0) {
+ p[0] = 33;
+ exit(0);
+ } else {
+ waitpid(pid, NULL, 0);
+
+ if (p[0] != 22) {
+ printf("MAP_PRIVATE copy-on-write failed: %m\n");
+ abort();
+ }
+ }
+
+ munmap(p, mfd_def_size);
+}
+
static void mfd_assert_write(int fd)
{
ssize_t l;
@@ -760,6 +794,8 @@ static void test_seal_future_write(void)
mfd_assert_read_shared(fd2);
mfd_fail_write(fd2);
+ mfd_assert_fork_private_write(fd);
+
munmap(p, mfd_def_size);
close(fd2);
close(fd);
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
index 1c0d76cb5adf..93b90a9b1eeb 100644
--- a/tools/testing/selftests/vm/config
+++ b/tools/testing/selftests/vm/config
@@ -1,2 +1,3 @@
CONFIG_SYSVIPC=y
CONFIG_USERFAULTFD=y
+CONFIG_TEST_VMALLOC=m