author     Linus Torvalds <torvalds@linux-foundation.org>   2020-04-07 14:11:54 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-04-07 14:11:54 -0700
commit     63bef48fd6c9d3f1ba4f0e23b4da1e007db6a3c0 (patch)
tree       f27c1ea7686b2ee30eea6b973a430f1c102bb03f /mm
parent     Merge tag 'nfs-for-5.7-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs (diff)
parent     ipc/shm.c: make compat_ksys_shmctl() static (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - a lot more of MM, quite a bit more yet to come: (memcg, pagemap,
   vmalloc, pagealloc, migration, thp, ksm, madvise, virtio,
   userfaultfd, memory-hotplug, shmem, rmap, zswap, zsmalloc, cleanups)

 - various other subsystems (procfs, misc, MAINTAINERS, bitops, lib,
   checkpatch, epoll, binfmt, kallsyms, reiserfs, kmod, gcov, kconfig,
   ubsan, fault-injection, ipc)

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (158 commits)
  ipc/shm.c: make compat_ksys_shmctl() static
  ipc/mqueue.c: fix a brace coding style issue
  lib/Kconfig.debug: fix a typo "capabilitiy" -> "capability"
  ubsan: include bug type in report header
  kasan: unset panic_on_warn before calling panic()
  ubsan: check panic_on_warn
  drivers/misc/lkdtm/bugs.c: add arithmetic overflow and array bounds checks
  ubsan: split "bounds" checker from other options
  ubsan: add trap instrumentation option
  init/Kconfig: clean up ANON_INODES and old IO schedulers options
  kernel/gcov/fs.c: replace zero-length array with flexible-array member
  gcov: gcc_3_4: replace zero-length array with flexible-array member
  gcov: gcc_4_7: replace zero-length array with flexible-array member
  kernel/kmod.c: fix a typo "assuems" -> "assumes"
  reiserfs: clean up several indentation issues
  kallsyms: unexport kallsyms_lookup_name() and kallsyms_on_each_symbol()
  samples/hw_breakpoint: drop use of kallsyms_lookup_name()
  samples/hw_breakpoint: drop HW_BREAKPOINT_R when reporting writes
  fs/binfmt_elf.c: don't free interpreter's ELF pheaders on common path
  fs/binfmt_elf.c: allocate less for static executable
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           | 135
-rw-r--r--  mm/Makefile          |   1
-rw-r--r--  mm/compaction.c      |   3
-rw-r--r--  mm/dmapool.c         |   4
-rw-r--r--  mm/filemap.c         |  14
-rw-r--r--  mm/gup.c             |   9
-rw-r--r--  mm/huge_memory.c     |  36
-rw-r--r--  mm/hugetlb.c         |   1
-rw-r--r--  mm/hugetlb_cgroup.c  |   6
-rw-r--r--  mm/internal.h        |   2
-rw-r--r--  mm/kasan/common.c    |  23
-rw-r--r--  mm/kasan/report.c    |  10
-rw-r--r--  mm/khugepaged.c      |  39
-rw-r--r--  mm/ksm.c             |   5
-rw-r--r--  mm/list_lru.c        |   2
-rw-r--r--  mm/memcontrol.c      |   5
-rw-r--r--  mm/memory-failure.c  |   2
-rw-r--r--  mm/memory.c          |  38
-rw-r--r--  mm/memory_hotplug.c  |  49
-rw-r--r--  mm/mempolicy.c       |  11
-rw-r--r--  mm/migrate.c         | 118
-rw-r--r--  mm/mm_init.c         |   2
-rw-r--r--  mm/mmap.c            |  10
-rw-r--r--  mm/mprotect.c        |  76
-rw-r--r--  mm/page_alloc.c      | 164
-rw-r--r--  mm/page_ext.c        |   5
-rw-r--r--  mm/page_isolation.c  |   6
-rw-r--r--  mm/page_reporting.c  | 364
-rw-r--r--  mm/page_reporting.h  |  54
-rw-r--r--  mm/rmap.c            |  23
-rw-r--r--  mm/shmem.c           | 166
-rw-r--r--  mm/shuffle.c         |  12
-rw-r--r--  mm/shuffle.h         |   6
-rw-r--r--  mm/slab_common.c     |   1
-rw-r--r--  mm/slub.c            |   3
-rw-r--r--  mm/sparse.c          | 136
-rw-r--r--  mm/swap.c            |  20
-rw-r--r--  mm/swapfile.c        |   1
-rw-r--r--  mm/userfaultfd.c     |  94
-rw-r--r--  mm/vmalloc.c         |   2
-rw-r--r--  mm/vmscan.c          |  12
-rw-r--r--  mm/vmstat.c          |   3
-rw-r--r--  mm/zsmalloc.c        |  10
-rw-r--r--  mm/zswap.c           |  24
44 files changed, 1267 insertions, 440 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933be65f..691021492e78 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -237,6 +237,17 @@ config COMPACTION
linux-mm@kvack.org.
#
+# support for free page reporting
+config PAGE_REPORTING
+ bool "Free page reporting"
+ def_bool n
+ help
+ Free page reporting allows for the incremental acquisition of
+ free pages from the buddy allocator for the purpose of reporting
+ those pages to another entity, such as a hypervisor, so that the
+ memory can be freed within the host for other uses.
+
+#
# support for page migration
#
config MIGRATION
@@ -420,10 +431,6 @@ config THP_SWAP
For selection by architectures with reasonable THP sizes.
-config TRANSPARENT_HUGE_PAGECACHE
- def_bool y
- depends on TRANSPARENT_HUGEPAGE
-
#
# UP and nommu archs use km based percpu allocator
#
@@ -526,7 +533,6 @@ config MEM_SOFT_DIRTY
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
depends on FRONTSWAP && CRYPTO=y
- select CRYPTO_LZO
select ZPOOL
help
A lightweight compressed cache for swap pages. It takes
@@ -542,6 +548,123 @@ config ZSWAP
they have not be fully explored on the large set of potential
configurations and workloads that exist.
+choice
+ prompt "Compressed cache for swap pages default compressor"
+ depends on ZSWAP
+ default ZSWAP_COMPRESSOR_DEFAULT_LZO
+ help
+ Selects the default compression algorithm for the compressed cache
+ for swap pages.
+
+ For an overview what kind of performance can be expected from
+ a particular compression algorithm please refer to the benchmarks
+ available at the following LWN page:
+ https://lwn.net/Articles/751795/
+
+ If in doubt, select 'LZO'.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.compressor=' option.
+
+config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ bool "Deflate"
+ select CRYPTO_DEFLATE
+ help
+ Use the Deflate algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZO
+ bool "LZO"
+ select CRYPTO_LZO
+ help
+ Use the LZO algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_842
+ bool "842"
+ select CRYPTO_842
+ help
+ Use the 842 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ bool "LZ4"
+ select CRYPTO_LZ4
+ help
+ Use the LZ4 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ bool "LZ4HC"
+ select CRYPTO_LZ4HC
+ help
+ Use the LZ4HC algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ bool "zstd"
+ select CRYPTO_ZSTD
+ help
+ Use the zstd algorithm as the default compression algorithm.
+endchoice
+
+config ZSWAP_COMPRESSOR_DEFAULT
+ string
+ depends on ZSWAP
+ default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
+ default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
+ default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ default ""
+
+choice
+ prompt "Compressed cache for swap pages default allocator"
+ depends on ZSWAP
+ default ZSWAP_ZPOOL_DEFAULT_ZBUD
+ help
+ Selects the default allocator for the compressed cache for
+ swap pages.
+ The default is 'zbud' for compatibility, however please do
+ read the description of each of the allocators below before
+ making a right choice.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.zpool=' option.
+
+config ZSWAP_ZPOOL_DEFAULT_ZBUD
+ bool "zbud"
+ select ZBUD
+ help
+ Use the zbud allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ bool "z3fold"
+ select Z3FOLD
+ help
+ Use the z3fold allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ bool "zsmalloc"
+ select ZSMALLOC
+ help
+ Use the zsmalloc allocator as the default allocator.
+endchoice
+
+config ZSWAP_ZPOOL_DEFAULT
+ string
+ depends on ZSWAP
+ default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
+ default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ default ""
+
+config ZSWAP_DEFAULT_ON
+ bool "Enable the compressed cache for swap pages by default"
+ depends on ZSWAP
+ help
+ If selected, the compressed cache for swap pages will be enabled
+ at boot, otherwise it will be disabled.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.enabled=' option.
+
config ZPOOL
tristate "Common API for compressed memory storage"
help
@@ -714,7 +837,7 @@ config GUP_GET_PTE_LOW_HIGH
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE && SHMEM
help
Allow khugepaged to put read-only file-backed pages in THP.
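The new ZSWAP_COMPRESSOR_DEFAULT/ZSWAP_ZPOOL_DEFAULT strings and the ZSWAP_DEFAULT_ON bool above only set build-time defaults; as the help texts note, they can still be overridden at boot with zswap.compressor=, zswap.zpool= and zswap.enabled=. Below is a minimal sketch of how such Kconfig symbols are typically wired up as module-parameter defaults; it is illustrative only, not the verbatim mm/zswap.c, and the variable names are assumptions.

/* Sketch: build-time defaults taken from the new Kconfig symbols,
 * still overridable via zswap.* on the kernel command line. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
module_param_named(enabled, zswap_enabled, bool, 0644);

static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
module_param_named(compressor, zswap_compressor, charp, 0644);

static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
module_param_named(zpool, zswap_zpool_type, charp, 0644);

With defaults handled this way, a command line such as "zswap.enabled=1 zswap.compressor=zstd zswap.zpool=z3fold" still takes precedence over whatever was chosen in Kconfig.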
diff --git a/mm/Makefile b/mm/Makefile
index dbc8346d16ca..fccd3756b25f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -111,3 +111,4 @@ obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
diff --git a/mm/compaction.c b/mm/compaction.c
index df3da2f76fdc..46f0fcc93081 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -481,6 +481,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
*/
static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
+ __acquires(lock)
{
/* Track if the lock is contended in async mode */
if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
@@ -989,7 +990,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_cache(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
hpage_nr_pages(page));
isolate_success:
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fe5d33060415..f9fb9bbd733e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -144,9 +144,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
else if (size < 4)
size = 4;
- if ((size % align) != 0)
- size = ALIGN(size, align);
-
+ size = ALIGN(size, align);
allocation = max_t(size_t, size, PAGE_SIZE);
if (!boundary)
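The dmapool change relies on ALIGN() already rounding up to the next multiple of a power-of-two alignment, which is what made the removed (size % align) pre-check redundant (align is validated as a power of two earlier in dma_pool_create()). Roughly, ignoring the typeof plumbing of the real kernel macro:

/* Rough equivalent of the round-up that ALIGN(size, align) performs
 * when 'a' is a power of two. */
#define ALIGN_ROUND_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))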
diff --git a/mm/filemap.c b/mm/filemap.c
index 0fbdc8e30dd2..23a051a7ef0f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1693,6 +1693,11 @@ EXPORT_SYMBOL(pagecache_get_page);
* Any shadow entries of evicted pages, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
+ * If it finds a Transparent Huge Page, head or tail, find_get_entries()
+ * stops at that page: the caller is likely to have a better way to handle
+ * the compound page as a whole, and then skip its extent, than repeatedly
+ * calling find_get_entries() to return all its tails.
+ *
* Return: the number of pages and shadow entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping,
@@ -1724,8 +1729,15 @@ unsigned find_get_entries(struct address_space *mapping,
/* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- page = find_subpage(page, xas.xa_index);
+ /*
+ * Terminate early on finding a THP, to allow the caller to
+ * handle it all at once; but continue if this is hugetlbfs.
+ */
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page = find_subpage(page, xas.xa_index);
+ nr_entries = ret + 1;
+ }
export:
indices[ret] = xas.xa_index;
entries[ret] = page;
diff --git a/mm/gup.c b/mm/gup.c
index da3e03185144..a21230569520 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -351,7 +351,8 @@ static struct page *no_page_table(struct vm_area_struct *vma,
* But we can only make this optimization where a hole would surely
* be zero-filled if handle_mm_fault() actually did handle it.
*/
- if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
+ if ((flags & FOLL_DUMP) &&
+ (vma_is_anonymous(vma) || !vma->vm_ops->fault))
return ERR_PTR(-EFAULT);
return NULL;
}
@@ -1101,7 +1102,7 @@ retry:
goto retry;
case -EBUSY:
ret = 0;
- /* FALLTHRU */
+ fallthrough;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
@@ -1416,7 +1417,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We want mlock to succeed for regions that have any permissions
* other than PROT_NONE.
*/
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ if (vma_is_accessible(vma))
gup_flags |= FOLL_FORCE;
/*
@@ -1676,7 +1677,7 @@ check_again:
list_add_tail(&head->lru, &cma_page_list);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON +
- page_is_file_cache(head),
+ page_is_file_lru(head),
hpage_nr_pages(head));
}
}
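Several hunks in this merge (here, plus mm/mempolicy.c and mm/mmap.c further down) replace open-coded VM_READ|VM_WRITE|VM_EXEC tests with vma_is_accessible(). Its body is exactly the static inline being removed from mm/memory.c later in this diff; the sketch below restates it next to the vma_is_anonymous() helper now used in no_page_table(). The header placement and the vma_is_anonymous() body are assumptions, not part of this diff.

/* Sketch of the helpers used above; assumed to live in include/linux/mm.h. */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
	return !vma->vm_ops;	/* anonymous mappings carry no vm_ops */
}

static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
	/* the same test callers used to open-code */
	return vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
}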
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0f9389f9d1f8..6ecd1045113b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -326,7 +326,7 @@ static struct attribute *hugepage_attr[] = {
&defrag_attr.attr,
&use_zero_page_attr.attr,
&hpage_pmd_size_attr.attr,
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
#ifdef CONFIG_DEBUG_VM
@@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
@@ -1043,6 +1044,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
+ /*
+ * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+ * does not have the VM_UFFD_WP, which means that the uffd
+ * fork event is not enabled.
+ */
+ if (!(vma->vm_flags & VM_UFFD_WP))
+ pmd = pmd_clear_uffd_wp(pmd);
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
@@ -1446,6 +1455,7 @@ alloc:
put_page(page);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
goto out;
}
@@ -1977,13 +1987,16 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* - HPAGE_PMD_NR is protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, int prot_numa)
+ unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t entry;
bool preserve_write;
int ret;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -2050,6 +2063,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
+ if (uffd_wp) {
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkuffd_wp(entry);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled by PF interrupt
+ * handler, then things like COW could be properly
+ * handled.
+ */
+ entry = pmd_clear_uffd_wp(entry);
+ }
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2198,7 +2222,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false;
+ bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
unsigned long addr;
int i;
@@ -2273,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
if (pmd_dirty(old_pmd))
@@ -2280,6 +2305,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
+ uffd_wp = pmd_uffd_wp(old_pmd);
}
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2304,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
@@ -2313,6 +2341,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_mkold(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
}
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f9ea1e5197b4..f5fb53fdfa02 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2010,6 +2010,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
+ __must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
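The __acquires() added to compact_lock_irqsave() in mm/compaction.c above and the __must_hold(&hugetlb_lock) added here are sparse context annotations: they let "make C=1" check which locks a function acquires or expects to already hold, and they compile away in a normal build. A hedged sketch of the underlying macros (the real definitions live in the compiler headers, include/linux/compiler_types.h in current trees):

#ifdef __CHECKER__
# define __must_hold(x)	__attribute__((context(x, 1, 1)))	/* held on entry and on exit */
# define __acquires(x)	__attribute__((context(x, 0, 1)))	/* not held on entry, held on exit */
# define __releases(x)	__attribute__((context(x, 1, 0)))	/* held on entry, released on exit */
#else
# define __must_hold(x)
# define __acquires(x)
# define __releases(x)
#endif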
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index c2d7ae6cabd1..aabf65d4d91b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -467,14 +467,14 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
switch (MEMFILE_ATTR(cft->private)) {
case RES_RSVD_USAGE:
counter = &h_cg->rsvd_hugepage[idx];
- /* Fall through. */
+ fallthrough;
case RES_USAGE:
val = (u64)page_counter_read(counter);
seq_printf(seq, "%llu\n", val * PAGE_SIZE);
break;
case RES_RSVD_LIMIT:
counter = &h_cg->rsvd_hugepage[idx];
- /* Fall through. */
+ fallthrough;
case RES_LIMIT:
val = (u64)counter->max;
if (val == limit)
@@ -514,7 +514,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_RSVD_LIMIT:
rsvd = true;
- /* Fall through. */
+ fallthrough;
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
ret = page_counter_set_max(
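The /* fall through */ and /* Fall through. */ comments converted here (and in mm/ksm.c, mm/list_lru.c, mm/memcontrol.c and mm/mmap.c below) become the fallthrough pseudo-keyword, which compilers can verify with -Wimplicit-fallthrough instead of relying on comment parsing. Approximately how it is defined (see include/linux/compiler_attributes.h for the real macro):

#if __has_attribute(__fallthrough__)
# define fallthrough	__attribute__((__fallthrough__))
#else
# define fallthrough	do {} while (0)	/* fallthrough */
#endif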
diff --git a/mm/internal.h b/mm/internal.h
index 2d58ae15a958..b5634e78f01d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -180,6 +180,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
extern int __isolate_free_page(struct page *page, unsigned int order);
+extern void __putback_isolated_page(struct page *page, unsigned int order,
+ int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index e61b4a492218..2906358e42f0 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -15,7 +15,6 @@
*/
#include <linux/export.h>
-#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
@@ -42,28 +41,6 @@
#include "kasan.h"
#include "../slab.h"
-static inline int in_irqentry_text(unsigned long ptr)
-{
- return (ptr >= (unsigned long)&__irqentry_text_start &&
- ptr < (unsigned long)&__irqentry_text_end) ||
- (ptr >= (unsigned long)&__softirqentry_text_start &&
- ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-static inline unsigned int filter_irq_stacks(unsigned long *entries,
- unsigned int nr_entries)
-{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (in_irqentry_text(entries[i])) {
- /* Include the irqentry function into the stack. */
- return i + 1;
- }
- }
- return nr_entries;
-}
-
static inline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[KASAN_STACK_DEPTH];
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index cf5c17d5e361..80f23c9da6b0 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -92,8 +92,16 @@ static void end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn)
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
panic("panic_on_warn set ...\n");
+ }
kasan_enable_current();
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c659c68728bc..99d77ffb79c2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -29,6 +29,7 @@ enum scan_result {
SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE,
SCAN_PTE_NON_PRESENT,
+ SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
@@ -414,8 +415,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
(IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
vma->vm_file &&
(vm_flags & VM_DENYWRITE))) {
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
@@ -513,7 +512,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
+ dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -613,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
+ NR_ISOLATED_ANON + page_is_file_lru(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -1139,6 +1138,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
if (++unmapped <= khugepaged_max_ptes_swap) {
+ /*
+ * Always be strict with uffd-wp
+ * enabled swap entries. Please see
+ * comment below for pte_uffd_wp().
+ */
+ if (pte_swp_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
@@ -1158,6 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
}
+ if (pte_uffd_wp(pteval)) {
+ /*
+ * Don't collapse the page if any of the small
+ * PTEs are armed with uffd write protection.
+ * Here we can also mark the new huge pmd as
+ * write protected if any of the small ones is
+ * marked but that could bring unknown
+ * userfault messages that fall outside of
+ * the registered range. So, just be simple.
+ */
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
@@ -1258,7 +1279,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
}
}
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
/*
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
@@ -1973,6 +1994,8 @@ skip:
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
+ goto skip;
while (khugepaged_scan.address < hend) {
int ret;
@@ -1984,14 +2007,10 @@ skip:
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
- struct file *file;
+ struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- if (shmem_file(vma->vm_file)
- && !shmem_huge_enabled(vma))
- goto skip;
- file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
khugepaged_scan_file(mm, file, pgoff, hpage);
diff --git a/mm/ksm.c b/mm/ksm.c
index d17c7d57d0d8..a558da9e7177 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -455,7 +455,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
- * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
+ * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
@@ -2813,8 +2813,7 @@ static int ksm_memory_callback(struct notifier_block *self,
*/
ksm_check_stable_tree(mn->start_pfn,
mn->start_pfn + mn->nr_pages);
- /* fallthrough */
-
+ fallthrough;
case MEM_CANCEL_OFFLINE:
mutex_lock(&ksm_thread_mutex);
ksm_run &= ~KSM_RUN_OFFLINE;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 8de5e3784ee4..4d5294c39bba 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -223,7 +223,7 @@ restart:
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
- /* fall through */
+ fallthrough;
case LRU_REMOVED:
isolated++;
nlru->nr_items--;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ca194864d802..05b4ec2c6499 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2254,7 +2254,8 @@ static void reclaim_high(struct mem_cgroup *memcg,
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
- } while ((memcg = parent_mem_cgroup(memcg)));
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
}
static void high_work_func(struct work_struct *work)
@@ -5812,7 +5813,7 @@ retry:
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_DEVICE:
device = true;
- /* fall through */
+ fallthrough;
case MC_TARGET_PAGE:
page = target.page;
/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1c961cd26c0b..a96364be8ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1810,7 +1810,7 @@ static int __soft_offline_page(struct page *page, int flags)
*/
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory.c b/mm/memory.c
index 586271f3efc6..19874d133a66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
@@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow_mapping(vm_flags)) {
make_device_private_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
@@ -785,6 +789,14 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
+ /*
+ * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+ * does not have the VM_UFFD_WP, which means that the uffd
+ * fork event is not enabled.
+ */
+ if (!(vm_flags & VM_UFFD_WP))
+ pte = pte_clear_uffd_wp(pte);
+
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
@@ -1940,7 +1952,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* @vma: user vma to map to
* @addr: target user address to start at
* @pfn: page frame number of kernel physical memory address
- * @size: size of map area
+ * @size: size of mapping area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
@@ -2752,6 +2764,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
@@ -3085,6 +3102,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ pte = pte_mkuffd_wp(pte);
+ pte = pte_wrprotect(pte);
+ }
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -3373,7 +3394,7 @@ map_pte:
return 0;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3475,8 +3496,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
pte_t entry;
vm_fault_t ret;
- if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
- IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
@@ -3949,8 +3969,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vmf->vma)) {
+ if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf, orig_pmd);
+ }
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -3964,11 +3987,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
return VM_FAULT_FALLBACK;
}
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
-}
-
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
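The hooks added to do_wp_page() and wp_huge_pmd() above divert write faults on uffd-wp protected entries to handle_userfault(VM_UFFD_WP) before any COW work happens. The predicates they call are defined outside this mm diff; the sketch below shows their assumed shape (include/linux/userfaultfd_k.h in this series) purely for orientation.

/* Assumed shape of the predicates used by do_wp_page()/wp_huge_pmd() above. */
static inline bool userfaultfd_wp(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_WP;
}

static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte)
{
	return userfaultfd_wp(vma) && pte_uffd_wp(pte);
}

static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd)
{
	return userfaultfd_wp(vma) && pmd_uffd_wp(pmd);
}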
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 19389cdc16a5..635e8e286598 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,18 +67,17 @@ void put_online_mems(void)
bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
#else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
#endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);
static int __init setup_memhp_default_state(char *str)
{
- if (!strcmp(str, "online"))
- memhp_auto_online = true;
- else if (!strcmp(str, "offline"))
- memhp_auto_online = false;
+ const int online_type = memhp_online_type_from_str(str);
+
+ if (online_type >= 0)
+ memhp_default_online_type = online_type;
return 1;
}
@@ -105,7 +104,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
char *resource_name = "System RAM";
- if (start + size > max_mem_size)
+ /*
+ * Make sure value parsed from 'mem=' only restricts memory adding
+ * while booting, so that memory hotplug won't be impacted. Please
+ * refer to document of 'mem=' in kernel-parameters.txt for more
+ * details.
+ */
+ if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
return ERR_PTR(-E2BIG);
/*
@@ -301,8 +306,9 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_restrictions *restrictions)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
int err;
- unsigned long nr, start_sec, end_sec;
struct vmem_altmap *altmap = restrictions->altmap;
err = check_hotplug_memory_addressable(pfn, nr_pages);
@@ -325,18 +331,13 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
if (err)
return err;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- err = sparse_add_section(nid, pfn, pfns, altmap);
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
+ err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
if (err)
break;
- pfn += pfns;
- nr_pages -= pfns;
cond_resched();
}
vmemmap_populate_print_last();
@@ -494,7 +495,7 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages,
unsigned long map_offset,
struct vmem_altmap *altmap)
{
- struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
+ struct mem_section *ms = __pfn_to_section(pfn);
if (WARN_ON_ONCE(!valid_section(ms)))
return;
@@ -528,7 +529,8 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
/* Select all remaining pages up to the next section boundary */
- cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
__remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
@@ -988,6 +990,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
+ mem->online_type = memhp_default_online_type;
return device_online(&mem->dev);
}
@@ -1060,7 +1063,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
mem_hotplug_done();
/* online pages if requested */
- if (memhp_auto_online)
+ if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1317,7 +1320,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
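memhp_auto_online (a bool) becomes memhp_default_online_type (an MMOP_* value), and the memhp_default_state= boot parameter is now parsed by memhp_online_type_from_str(), which lives outside this mm diff (drivers/base/memory.c). A hedged sketch of what that parsing amounts to; the string table and the error value are assumptions:

/* Assumed shape of the parser used by setup_memhp_default_state() above. */
static const char * const online_type_to_str[] = {
	[MMOP_OFFLINE]		= "offline",
	[MMOP_ONLINE]		= "online",
	[MMOP_ONLINE_KERNEL]	= "online_kernel",
	[MMOP_ONLINE_MOVABLE]	= "online_movable",
};

int memhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;	/* the matching MMOP_* value */
	}
	return -EINVAL;	/* caller leaves the compiled-in default untouched */
}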
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5fb427aed612..b4a039d779b0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -442,6 +442,7 @@ static inline bool queue_pages_required(struct page *page,
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
+ __releases(ptl)
{
int ret = 0;
struct page *page;
@@ -627,7 +628,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
@@ -678,8 +679,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (!is_vm_hugetlb_page(vma) &&
- (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
@@ -881,7 +881,6 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
@@ -1023,7 +1022,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
hpage_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
@@ -2066,7 +2065,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
break;
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*mask = mempolicy->v.nodes;
break;
@@ -2333,7 +2331,6 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
switch (a->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
diff --git a/mm/migrate.c b/mm/migrate.c
index 7ded07081be9..7160c1556f79 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l)
put_page(page);
} else {
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
entry = pte_to_swp_entry(*pvmw.pte);
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
if (unlikely(is_zone_device_page(new))) {
if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
}
}
@@ -647,6 +651,14 @@ void migrate_page_states(struct page *newpage, struct page *page)
if (PageWriteback(newpage))
end_page_writeback(newpage);
+ /*
+ * PG_readahead shares the same bit with PG_reclaim. The above
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
+ * bit after that.
+ */
+ if (PageReadahead(page))
+ SetPageReadahead(newpage);
+
copy_page_owner(page, newpage);
mem_cgroup_migrate(page, newpage);
@@ -1211,7 +1223,7 @@ out:
*/
if (likely(!__PageMovable(page)))
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -hpage_nr_pages(page));
}
/*
@@ -1518,9 +1530,6 @@ static int do_move_pages_to_node(struct mm_struct *mm,
{
int err;
- if (list_empty(pagelist))
- return 0;
-
err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
@@ -1587,7 +1596,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
err = 1;
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
hpage_nr_pages(head));
}
out_putpage:
@@ -1602,6 +1611,32 @@ out:
return err;
}
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+ struct list_head *pagelist, int __user *status,
+ int start, int i, unsigned long nr_pages)
+{
+ int err;
+
+ if (list_empty(pagelist))
+ return 0;
+
+ err = do_move_pages_to_node(mm, pagelist, node);
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, so need to include the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
+ return err;
+ }
+ return store_status(status, start, node, i - start);
+}
+
/*
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
@@ -1645,21 +1680,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
current_node = node;
start = i;
} else if (node != current_node) {
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- /*
- * Positive err means the number of failed
- * pages to migrate. Since we are going to
- * abort and return the number of non-migrated
- * pages, so need to incude the rest of the
- * nr_pages that have not been attempted as
- * well.
- */
- if (err > 0)
- err += nr_pages - i - 1;
- goto out;
- }
- err = store_status(status, start, current_node, i - start);
+ err = move_pages_and_store_status(mm, current_node,
+ &pagelist, status, start, i, nr_pages);
if (err)
goto out;
start = i;
@@ -1673,49 +1695,29 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
err = add_page_for_migration(mm, addr, current_node,
&pagelist, flags & MPOL_MF_MOVE_ALL);
- if (!err) {
- /* The page is already on the target node */
- err = store_status(status, i, current_node, 1);
- if (err)
- goto out_flush;
- continue;
- } else if (err > 0) {
+ if (err > 0) {
/* The page is successfully queued for migration */
continue;
}
- err = store_status(status, i, err, 1);
+ /*
+ * If the page is already on the target node (!err), store the
+ * node, otherwise, store the err.
+ */
+ err = store_status(status, i, err ? : current_node, 1);
if (err)
goto out_flush;
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- if (err > 0)
- err += nr_pages - i - 1;
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
+ if (err)
goto out;
- }
- if (i > start) {
- err = store_status(status, start, current_node, i - start);
- if (err)
- goto out;
- }
current_node = NUMA_NO_NODE;
}
out_flush:
- if (list_empty(&pagelist))
- return err;
-
/* Make sure we do not overwrite the existing error */
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
- /*
- * Don't have to report non-attempted pages here since:
- * - If the above loop is done gracefully all pages have been
- * attempted.
- * - If the above loop is aborted it means a fatal error
- * happened, should return ret.
- */
- if (!err1)
- err1 = store_status(status, start, current_node, i - start);
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
if (err >= 0)
err = err1;
out:
@@ -1957,7 +1959,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 0;
}
- page_lru = page_is_file_cache(page);
+ page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
hpage_nr_pages(page));
@@ -1993,7 +1995,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
(vma->vm_flags & VM_EXEC))
goto out;
@@ -2001,7 +2003,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Also do not migrate dirty pages as not all filesystems can move
* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
*/
- if (page_is_file_cache(page) && PageDirty(page))
+ if (page_is_file_lru(page) && PageDirty(page))
goto out;
isolated = numamigrate_isolate_page(pgdat, page);
@@ -2016,7 +2018,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (!list_empty(&migratepages)) {
list_del(&page->lru);
dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
putback_lru_page(page);
}
isolated = 0;
@@ -2046,7 +2048,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- int page_lru = page_is_file_cache(page);
+ int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
@@ -2340,6 +2342,8 @@ again:
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pte))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, addr, ptep, swp_pte);
/*
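The do_pages_move() refactor above only changes kernel-side bookkeeping for the move_pages(2) status array: a positive return still counts pages that were never attempted, and each status slot receives either the destination node or a negative errno. A small userspace illustration of that contract, using the libnuma wrapper (build with -lnuma); this program is not taken from the patch.

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	void *buf = NULL;
	void *pages[1];
	int nodes[1] = { 0 };		/* ask for node 0 */
	int status[1] = { -1 };
	long ret;

	if (posix_memalign(&buf, pagesize, pagesize))
		return 1;
	((char *)buf)[0] = 1;		/* touch it so the page exists */

	pages[0] = buf;
	ret = move_pages(0 /* current process */, 1, pages, nodes, status,
			 MPOL_MF_MOVE);
	if (ret < 0)
		perror("move_pages");
	else
		printf("not migrated: %ld, status[0]: %d\n", ret, status[0]);

	free(buf);
	return 0;
}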
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5c918388de99..7da6991d9435 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -37,7 +37,7 @@ void __init mminit_verify_zonelist(void)
struct zonelist *zonelist;
int i, listid, zoneid;
- BUG_ON(MAX_ZONELISTS > 2);
+ BUILD_BUG_ON(MAX_ZONELISTS > 2);
for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
/* Identify the zone and nodelist */
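Switching mminit_verify_zonelist() from BUG_ON() to BUILD_BUG_ON() turns the MAX_ZONELISTS check into a build failure rather than a runtime crash, which is possible because both operands are compile-time constants. Conceptually the macro boils down to something like the line below (the real BUILD_BUG_ON() in include/linux/build_bug.h goes through compiletime_assert()):

/* Rough stand-in: a negative array size breaks the build when cond is true. */
#define BUILD_BUG_ON_SKETCH(cond)	((void)sizeof(char[1 - 2 * !!(cond)]))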
diff --git a/mm/mmap.c b/mm/mmap.c
index 94ae18398c59..8d77dbbb80fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1460,7 +1460,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
- /* fall through */
+ fallthrough;
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
@@ -1487,8 +1487,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
- /* fall through */
+ fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
@@ -2358,8 +2357,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
gap_addr = TASK_SIZE;
next = vma->vm_next;
- if (next && next->vm_start < gap_addr &&
- (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
@@ -2440,7 +2438,7 @@ int expand_downwards(struct vm_area_struct *vma,
prev = vma->vm_prev;
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
- (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ vma_is_accessible(prev)) {
if (address - prev->vm_end < stack_guard_gap)
return -ENOMEM;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 311c0dadf71c..1d823b050329 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,12 +37,16 @@
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
+ bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
* Can be called with only the mmap_sem for reading by
@@ -98,7 +102,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* it cannot move them all from MIGRATE_ASYNC
* context.
*/
- if (page_is_file_cache(page) && PageDirty(page))
+ if (page_is_file_lru(page) && PageDirty(page))
continue;
/*
@@ -114,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (preserve_write)
ptent = pte_mk_savedwrite(ptent);
+ if (uffd_wp) {
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_mkuffd_wp(ptent);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled
+ * by PF interrupt handler, then
+ * things like COW could be properly
+ * handled.
+ */
+ ptent = pte_clear_uffd_wp(ptent);
+ }
+
/* Avoid taking write faults for known dirty pages */
if (dirty_accountable && pte_dirty(ptent) &&
(pte_soft_dirty(ptent) ||
@@ -122,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
- } else if (IS_ENABLED(CONFIG_MIGRATION)) {
+ } else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
+ pte_t newpte;
if (is_write_migration_entry(entry)) {
- pte_t newpte;
/*
* A protection check is difficult so
* just be safe and disable write
@@ -135,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- set_pte_at(vma->vm_mm, addr, pte, newpte);
-
- pages++;
- }
-
- if (is_write_device_private_entry(entry)) {
- pte_t newpte;
-
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_write_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_one_pte() for explanation.
*/
make_device_private_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_pte_at(vma->vm_mm, addr, pte, newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else {
+ newpte = oldpte;
+ }
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(oldpte, newpte)) {
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
}
@@ -188,7 +211,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
- pgprot_t newprot, int dirty_accountable, int prot_numa)
+ pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
@@ -229,7 +252,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
- newprot, prot_numa);
+ newprot, cp_flags);
if (nr_ptes) {
if (nr_ptes == HPAGE_PMD_NR) {
@@ -244,7 +267,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
/* fall through, the trans huge pmd just split */
}
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
pages += this_pages;
next:
cond_resched();
@@ -260,7 +283,7 @@ next:
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
p4d_t *p4d, unsigned long addr, unsigned long end,
- pgprot_t newprot, int dirty_accountable, int prot_numa)
+ pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
@@ -272,7 +295,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(vma, pud, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
} while (pud++, addr = next, addr != end);
return pages;
@@ -280,7 +303,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
pgd_t *pgd, unsigned long addr, unsigned long end,
- pgprot_t newprot, int dirty_accountable, int prot_numa)
+ pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
@@ -292,7 +315,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
if (p4d_none_or_clear_bad(p4d))
continue;
pages += change_pud_range(vma, p4d, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
} while (p4d++, addr = next, addr != end);
return pages;
@@ -300,7 +323,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -317,7 +340,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_p4d_range(vma, pgd, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
} while (pgd++, addr = next, addr != end);
/* Only flush the TLB if we actually modified any entries: */
@@ -330,14 +353,17 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ unsigned long cp_flags)
{
unsigned long pages;
+ BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
- pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ pages = change_protection_range(vma, start, end, newprot,
+ cp_flags);
return pages;
}
@@ -459,7 +485,7 @@ success:
vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
- dirty_accountable, 0);
+ dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
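change_protection() and its helpers now take a single cp_flags bitmask instead of separate dirty_accountable/prot_numa ints, with two additional bits for arming and resolving uffd write protection; the BUG_ON above enforces that the two uffd-wp bits are never passed together. The flag names come from these hunks, but the bit assignments below are an assumption for illustration only.

/* Assumed bit layout of the new change_protection() flags. */
#define MM_CP_DIRTY_ACCT		(1UL << 0)	/* may leave ptes writable (mprotect) */
#define MM_CP_PROT_NUMA			(1UL << 1)	/* make PROT_NONE prot-numa ptes */
#define MM_CP_UFFD_WP			(1UL << 2)	/* arm uffd write protection */
#define MM_CP_UFFD_WP_RESOLVE		(1UL << 3)	/* undo uffd write protection */
#define MM_CP_UFFD_WP_ALL		(MM_CP_UFFD_WP | MM_CP_UFFD_WP_RESOLVE)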
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5f76da8cd4e..114c56c3685d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -74,6 +74,7 @@
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
+#include "page_reporting.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -864,6 +865,78 @@ compaction_capture(struct capture_control *capc, struct page *page,
}
#endif /* CONFIG_COMPACTION */
+/* Used for pages not on another list */
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_add(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages which are on another list */
+static inline void move_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_move(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order)
+{
+ /* clear reported state and update reported page count */
+ if (page_reported(page))
+ __ClearPageReported(page);
+
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ zone->free_area[order].nr_free--;
+}
+
+/*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case,
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+static inline bool
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
+ struct page *page, unsigned int order)
+{
+ struct page *higher_page, *higher_buddy;
+ unsigned long combined_pfn;
+
+ if (order >= MAX_ORDER - 2)
+ return false;
+
+ if (!pfn_valid_within(buddy_pfn))
+ return false;
+
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+
+ return pfn_valid_within(buddy_pfn) &&
+ page_is_buddy(higher_page, higher_buddy, order + 1);
+}
+
/*
* Freeing function for a buddy system allocator.
*
@@ -891,13 +964,14 @@ compaction_capture(struct capture_control *capc, struct page *page,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype)
+ int migratetype, bool report)
{
- unsigned long combined_pfn;
+ struct capture_control *capc = task_capc(zone);
unsigned long uninitialized_var(buddy_pfn);
- struct page *buddy;
+ unsigned long combined_pfn;
unsigned int max_order;
- struct capture_control *capc = task_capc(zone);
+ struct page *buddy;
+ bool to_tail;
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -932,7 +1006,7 @@ continue_merging:
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
- del_page_from_free_area(buddy, &zone->free_area[order]);
+ del_page_from_free_list(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -966,35 +1040,19 @@ continue_merging:
done_merging:
set_page_order(page, order);
- /*
- * If this is not the largest possible page, check if the buddy
- * of the next-highest order is free. If it is, it's possible
- * that pages are being freed that will coalesce soon. In case,
- * that is happening, add the free page to the tail of the list
- * so it's less likely to be used soon and more likely to be merged
- * as a higher order page
- */
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
- && !is_shuffle_order(order)) {
- struct page *higher_page, *higher_buddy;
- combined_pfn = buddy_pfn & pfn;
- higher_page = page + (combined_pfn - pfn);
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
- if (pfn_valid_within(buddy_pfn) &&
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
- add_to_free_area_tail(page, &zone->free_area[order],
- migratetype);
- return;
- }
- }
-
if (is_shuffle_order(order))
- add_to_free_area_random(page, &zone->free_area[order],
- migratetype);
+ to_tail = shuffle_pick_tail();
+ else
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
+
+ if (to_tail)
+ add_to_free_list_tail(page, zone, order, migratetype);
else
- add_to_free_area(page, &zone->free_area[order], migratetype);
+ add_to_free_list(page, zone, order, migratetype);
+ /* Notify page reporting subsystem of freed page */
+ if (report)
+ page_reporting_notify_free(order);
}
/*
@@ -1311,7 +1369,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1327,7 +1385,7 @@ static void free_one_page(struct zone *zone,
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype);
+ __free_one_page(page, pfn, zone, order, migratetype, true);
spin_unlock(&zone->lock);
}
@@ -2008,13 +2066,11 @@ void __init init_cma_reserved_pageblock(struct page *page)
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area,
- int migratetype)
+ int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
- area--;
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
@@ -2028,7 +2084,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- add_to_free_area(&page[size], area, migratetype);
+ add_to_free_list(&page[size], zone, high, migratetype);
set_page_order(&page[size], high);
}
}
@@ -2186,8 +2242,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_area(page, area);
- expand(zone, page, order, current_order, area, migratetype);
+ del_page_from_free_list(page, zone, current_order);
+ expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -2261,7 +2317,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = page_order(page);
- move_to_free_area(page, &zone->free_area[order], migratetype);
+ move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2377,7 +2433,6 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
- struct free_area *area;
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -2448,8 +2503,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
return;
single_page:
- area = &zone->free_area[current_order];
- move_to_free_area(page, area, start_type);
+ move_to_free_list(page, zone, current_order, start_type);
}
/*
@@ -3120,7 +3174,6 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
- struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -3146,7 +3199,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
/* Remove page from free list */
- del_page_from_free_area(page, area);
+ del_page_from_free_list(page, zone, order);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -3167,6 +3220,25 @@ int __isolate_free_page(struct page *page, unsigned int order)
return 1UL << order;
}
+/**
+ * __putback_isolated_page - Return a now-isolated page back where we got it
+ * @page: Page that was isolated
+ * @order: Order of the isolated page
+ * @mt: Migratetype of the isolated page
+ *
+ * This function is meant to return a page pulled from the free lists via
+ * __isolate_free_page back to the free list it was pulled from.
+ */
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
+{
+ struct zone *zone = page_zone(page);
+
+ /* zone lock should be held when this function is called */
+ lockdep_assert_held(&zone->lock);
+
+ /* Return isolated page to tail of freelist. */
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+}
+
/*
* Update NUMA hit/miss statistics
*
@@ -8713,7 +8785,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
offlined_pages += 1 << order;
- del_page_from_free_area(page, &zone->free_area[order]);
+ del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 08ded037f89f..a3616f7a0e9e 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -303,11 +303,8 @@ static int __meminit online_page_ext(unsigned long start_pfn,
VM_BUG_ON(!node_state(nid, N_ONLINE));
}
- for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
- if (!pfn_in_present_section(pfn))
- continue;
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
fail = init_section_page_ext(pfn, nid);
- }
if (!fail)
return 0;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a9fd7c740c23..2c11a38d6e87 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -117,13 +117,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
set_pageblock_migratetype(page, migratetype);
+ if (isolated_page)
+ __putback_isolated_page(page, order, migratetype);
zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (isolated_page) {
- post_alloc_hook(page, order, __GFP_MOVABLE);
- __free_pages(page, order);
- }
}
static inline struct page *
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
new file mode 100644
index 000000000000..3bbd471cfc81
--- /dev/null
+++ b/mm/page_reporting.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/page_reporting.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+
+#include "page_reporting.h"
+#include "internal.h"
+
+#define PAGE_REPORTING_DELAY (2 * HZ)
+static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
+
+enum {
+ PAGE_REPORTING_IDLE = 0,
+ PAGE_REPORTING_REQUESTED,
+ PAGE_REPORTING_ACTIVE
+};
+
+/* request page reporting */
+static void
+__page_reporting_request(struct page_reporting_dev_info *prdev)
+{
+ unsigned int state;
+
+ /* Check to see if we are in desired state */
+ state = atomic_read(&prdev->state);
+ if (state == PAGE_REPORTING_REQUESTED)
+ return;
+
+ /*
+ * If reporting is already active there is nothing we need to do.
+ * Test against 0 as that represents PAGE_REPORTING_IDLE.
+ */
+ state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
+ if (state != PAGE_REPORTING_IDLE)
+ return;
+
+ /*
+ * Delay the start of work to allow a sizable queue to build. For
+ * now we are limiting this to running no more than once every
+ * couple of seconds.
+ */
+ schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+/* notify prdev of free page reporting request */
+void __page_reporting_notify(void)
+{
+ struct page_reporting_dev_info *prdev;
+
+ /*
+ * We use RCU to protect the pr_dev_info pointer. In almost all
+ * cases this should be present; however, in the unlikely case of
+ * a shutdown it will be NULL and we should exit.
+ */
+ rcu_read_lock();
+ prdev = rcu_dereference(pr_dev_info);
+ if (likely(prdev))
+ __page_reporting_request(prdev);
+
+ rcu_read_unlock();
+}
+
+static void
+page_reporting_drain(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sgl, unsigned int nents, bool reported)
+{
+ struct scatterlist *sg = sgl;
+
+ /*
+ * Drain the now reported pages back into their respective
+ * free lists/areas. We assume at least one page is populated.
+ */
+ do {
+ struct page *page = sg_page(sg);
+ int mt = get_pageblock_migratetype(page);
+ unsigned int order = get_order(sg->length);
+
+ __putback_isolated_page(page, order, mt);
+
+ /* If the pages were not reported due to an error, skip flagging */
+ if (!reported)
+ continue;
+
+ /*
+ * If the page was not commingled with another page we can
+ * consider the result to be "reported" since the page
+ * hasn't been modified; otherwise we will need to
+ * report on the new larger page when we make our way
+ * up to that higher order.
+ */
+ if (PageBuddy(page) && page_order(page) == order)
+ __SetPageReported(page);
+ } while ((sg = sg_next(sg)));
+
+ /* reinitialize scatterlist now that it is empty */
+ sg_init_table(sgl, nents);
+}
+
+/*
+ * The page reporting cycle consists of 4 stages: fill, report, drain, and
+ * idle. We cycle through the first 3 stages until we cannot obtain a
+ * full scatterlist of pages, at which point we switch to idle.
+ */
+static int
+page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
+ unsigned int order, unsigned int mt,
+ struct scatterlist *sgl, unsigned int *offset)
+{
+ struct free_area *area = &zone->free_area[order];
+ struct list_head *list = &area->free_list[mt];
+ unsigned int page_len = PAGE_SIZE << order;
+ struct page *page, *next;
+ long budget;
+ int err = 0;
+
+ /*
+ * Perform an early check: if the free area is empty there is
+ * nothing to process, so we can skip this free_list.
+ */
+ if (list_empty(list))
+ return err;
+
+ spin_lock_irq(&zone->lock);
+
+ /*
+ * Limit how many calls we will be making to the page reporting
+ * device for this list. By doing this we avoid processing any
+ * given list for too long.
+ *
+ * The current value used allows us enough calls to process over a
+ * sixteenth of the current list plus one additional call to handle
+ * any pages that may have already been present from the previous
+ * list processed. This should result in us reporting all pages on
+ * an idle system in about 30 seconds.
+ *
+ * The division here should be cheap since PAGE_REPORTING_CAPACITY
+ * should always be a power of 2.
+ */
+ budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
+
+ /* loop through free list adding unreported pages to sg list */
+ list_for_each_entry_safe(page, next, list, lru) {
+ /* We are going to skip over the reported pages. */
+ if (PageReported(page))
+ continue;
+
+ /*
+ * If we fully consumed our budget then update our
+ * state to indicate that we are requesting additional
+ * processing and exit this list.
+ */
+ if (budget < 0) {
+ atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
+ next = page;
+ break;
+ }
+
+ /* Attempt to pull page from list and place in scatterlist */
+ if (*offset) {
+ if (!__isolate_free_page(page, order)) {
+ next = page;
+ break;
+ }
+
+ /* Add page to scatter list */
+ --(*offset);
+ sg_set_page(&sgl[*offset], page, page_len, 0);
+
+ continue;
+ }
+
+ /*
+ * Make the first non-reported page in the free list
+ * the new head of the free list before we release the
+ * zone lock.
+ */
+ if (&page->lru != list && !list_is_first(&page->lru, list))
+ list_rotate_to_front(&page->lru, list);
+
+ /* release lock before waiting on report processing */
+ spin_unlock_irq(&zone->lock);
+
+ /* begin processing pages in local list */
+ err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
+
+ /* reset offset since the full list was reported */
+ *offset = PAGE_REPORTING_CAPACITY;
+
+ /* update budget to reflect call to report function */
+ budget--;
+
+ /* reacquire zone lock and resume processing */
+ spin_lock_irq(&zone->lock);
+
+ /* flush reported pages from the sg list */
+ page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
+
+ /*
+ * Reset next to the first entry; the old next isn't valid
+ * since we dropped the lock to report the pages.
+ */
+ next = list_first_entry(list, struct page, lru);
+
+ /* exit on error */
+ if (err)
+ break;
+ }
+
+ /* Rotate any leftover pages to the head of the freelist */
+ if (&next->lru != list && !list_is_first(&next->lru, list))
+ list_rotate_to_front(&next->lru, list);
+
+ spin_unlock_irq(&zone->lock);
+
+ return err;
+}
+
+static int
+page_reporting_process_zone(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sgl, struct zone *zone)
+{
+ unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
+ unsigned long watermark;
+ int err = 0;
+
+ /* Generate minimum watermark to be able to guarantee progress */
+ watermark = low_wmark_pages(zone) +
+ (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+
+ /*
+ * Cancel request if insufficient free memory or if we failed
+ * to allocate page reporting statistics for the zone.
+ */
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ return err;
+
+ /* Process each free list starting from lowest order/mt */
+ for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+ for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+ /* We do not pull pages from the isolate free list */
+ if (is_migrate_isolate(mt))
+ continue;
+
+ err = page_reporting_cycle(prdev, zone, order, mt,
+ sgl, &offset);
+ if (err)
+ return err;
+ }
+ }
+
+ /* report the leftover pages before going idle */
+ leftover = PAGE_REPORTING_CAPACITY - offset;
+ if (leftover) {
+ sgl = &sgl[offset];
+ err = prdev->report(prdev, sgl, leftover);
+
+ /* flush any remaining pages out from the last report */
+ spin_lock_irq(&zone->lock);
+ page_reporting_drain(prdev, sgl, leftover, !err);
+ spin_unlock_irq(&zone->lock);
+ }
+
+ return err;
+}
+
+static void page_reporting_process(struct work_struct *work)
+{
+ struct delayed_work *d_work = to_delayed_work(work);
+ struct page_reporting_dev_info *prdev =
+ container_of(d_work, struct page_reporting_dev_info, work);
+ int err = 0, state = PAGE_REPORTING_ACTIVE;
+ struct scatterlist *sgl;
+ struct zone *zone;
+
+ /*
+ * Change the state to "Active" so that we can track whether anyone
+ * requests page reporting after we complete our pass. If
+ * the state is not altered by the end of the pass we will switch
+ * to idle and quit scheduling reporting runs.
+ */
+ atomic_set(&prdev->state, state);
+
+ /* allocate scatterlist to store pages being reported on */
+ sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
+ if (!sgl)
+ goto err_out;
+
+ sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
+
+ for_each_zone(zone) {
+ err = page_reporting_process_zone(prdev, sgl, zone);
+ if (err)
+ break;
+ }
+
+ kfree(sgl);
+err_out:
+ /*
+ * If the state has reverted back to requested then there may be
+ * additional pages to be processed. We will defer for 2s to allow
+ * more pages to accumulate.
+ */
+ state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
+ if (state == PAGE_REPORTING_REQUESTED)
+ schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+static DEFINE_MUTEX(page_reporting_mutex);
+DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
+
+int page_reporting_register(struct page_reporting_dev_info *prdev)
+{
+ int err = 0;
+
+ mutex_lock(&page_reporting_mutex);
+
+ /* nothing to do if already in use */
+ if (rcu_access_pointer(pr_dev_info)) {
+ err = -EBUSY;
+ goto err_out;
+ }
+
+ /* initialize state and work structures */
+ atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
+ INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
+
+ /* Begin initial flush of zones */
+ __page_reporting_request(prdev);
+
+ /* Assign device to allow notifications */
+ rcu_assign_pointer(pr_dev_info, prdev);
+
+ /* enable page reporting notification */
+ if (!static_key_enabled(&page_reporting_enabled)) {
+ static_branch_enable(&page_reporting_enabled);
+ pr_info("Free page reporting enabled\n");
+ }
+err_out:
+ mutex_unlock(&page_reporting_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(page_reporting_register);
+
+void page_reporting_unregister(struct page_reporting_dev_info *prdev)
+{
+ mutex_lock(&page_reporting_mutex);
+
+ if (rcu_access_pointer(pr_dev_info) == prdev) {
+ /* Disable page reporting notification */
+ RCU_INIT_POINTER(pr_dev_info, NULL);
+ synchronize_rcu();
+
+ /* Flush any existing work, and lock it out */
+ cancel_delayed_work_sync(&prdev->work);
+ }
+
+ mutex_unlock(&page_reporting_mutex);
+}
+EXPORT_SYMBOL_GPL(page_reporting_unregister);
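
For context on how the two exported entry points above are meant to be consumed: a backend fills in a struct page_reporting_dev_info with a report() callback and registers it, after which the core batches isolated free pages into the scatterlist handed to that callback. Below is a minimal sketch of such a backend. It uses only symbols visible in this patch (page_reporting_register/unregister, the <linux/page_reporting.h> include, and the ->report hook, whose prototype is inferred from the call sites here); the module name and the no-op callback body are illustrative assumptions, not code from this series.

// Hedged sketch: a hypothetical no-op reporting backend. A real consumer
// (e.g. a virtio balloon driver) would hand the scatterlist ranges to a
// hypervisor instead of silently accepting them.
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/page_reporting.h>

static int noop_report(struct page_reporting_dev_info *prdev,
		       struct scatterlist *sgl, unsigned int nents)
{
	/* Returning 0 lets page_reporting_drain() flag the pages as
	 * reported; a non-zero error leaves them unflagged and ends
	 * the current reporting cycle. */
	return 0;
}

static struct page_reporting_dev_info noop_prdev = {
	.report = noop_report,
};

static int __init noop_reporting_init(void)
{
	/* Only one backend may be registered at a time (-EBUSY otherwise). */
	return page_reporting_register(&noop_prdev);
}

static void __exit noop_reporting_exit(void)
{
	page_reporting_unregister(&noop_prdev);
}

module_init(noop_reporting_init);
module_exit(noop_reporting_exit);
MODULE_LICENSE("GPL");

Note that registration immediately requests an initial flush via __page_reporting_request(), so a freshly registered backend will see report() invoked after PAGE_REPORTING_DELAY even before any new frees occur.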
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
new file mode 100644
index 000000000000..aa6d37f4dc22
--- /dev/null
+++ b/mm/page_reporting.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_PAGE_REPORTING_H
+#define _MM_PAGE_REPORTING_H
+
+#include <linux/mmzone.h>
+#include <linux/pageblock-flags.h>
+#include <linux/page-isolation.h>
+#include <linux/jump_label.h>
+#include <linux/slab.h>
+#include <asm/pgtable.h>
+#include <linux/scatterlist.h>
+
+#define PAGE_REPORTING_MIN_ORDER pageblock_order
+
+#ifdef CONFIG_PAGE_REPORTING
+DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
+void __page_reporting_notify(void);
+
+static inline bool page_reported(struct page *page)
+{
+ return static_branch_unlikely(&page_reporting_enabled) &&
+ PageReported(page);
+}
+
+/**
+ * page_reporting_notify_free - Free page notification to start page processing
+ * @order: Order of the page that has just been freed
+ *
+ * This function is meant to act as a screener for __page_reporting_notify,
+ * which will determine if a given zone has crossed over the high-water mark
+ * that justifies beginning page treatment. If we have crossed that
+ * threshold then it will start the process of pulling some pages and
+ * placing them in the batch list for treatment.
+ */
+static inline void page_reporting_notify_free(unsigned int order)
+{
+ /* Called from hot path in __free_one_page() */
+ if (!static_branch_unlikely(&page_reporting_enabled))
+ return;
+
+ /* Determine if we have crossed reporting threshold */
+ if (order < PAGE_REPORTING_MIN_ORDER)
+ return;
+
+ /* This will add a few cycles, but should be called infrequently */
+ __page_reporting_notify();
+}
+#else /* CONFIG_PAGE_REPORTING */
+#define page_reported(_page) false
+
+static inline void page_reporting_notify_free(unsigned int order)
+{
+}
+#endif /* CONFIG_PAGE_REPORTING */
+#endif /*_MM_PAGE_REPORTING_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 2df75a119c83..f79a206b271a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -275,19 +275,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
- struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
-
- /*
- * If parent share anon_vma with its vm_prev, keep this sharing in in
- * child.
- *
- * 1. Parent has vm_prev, which implies we have vm_prev.
- * 2. Parent and its vm_prev have the same anon_vma.
- */
- if (!dst->anon_vma && src->anon_vma &&
- pprev && pprev->anon_vma == src->anon_vma)
- dst->anon_vma = prev->anon_vma;
-
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
@@ -946,7 +933,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
set_pte_at(vma->vm_mm, address, pte, entry);
ret = 1;
} else {
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t *pmd = pvmw.pmd;
pmd_t entry;
@@ -1385,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
- enum ttu_flags flags = (enum ttu_flags)arg;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1515,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
@@ -1614,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
@@ -1680,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
diff --git a/mm/shmem.c b/mm/shmem.c
index f47347cb30f6..d722eb830317 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -410,7 +410,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#define SHMEM_HUGE_DENY (-1)
#define SHMEM_HUGE_FORCE (-2)
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
static int shmem_huge __read_mostly;
@@ -580,7 +580,7 @@ static long shmem_unused_huge_count(struct super_block *sb,
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
return READ_ONCE(sbinfo->shrinklist_len);
}
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
#define shmem_huge SHMEM_HUGE_DENY
@@ -589,11 +589,11 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
{
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
(shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
shmem_huge != SHMEM_HUGE_DENY)
return true;
@@ -789,6 +789,32 @@ void shmem_unlock_mapping(struct address_space *mapping)
}
/*
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases it might do so, if shmem_undo_range() failed to trylock on
+ * the head, and then succeeded in trylocking the tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+ if (!PageTransCompound(page))
+ return true;
+
+ /* Just proceed to delete a huge page wholly within the range punched */
+ if (PageHead(page) &&
+ page->index >= start && page->index + HPAGE_PMD_NR <= end)
+ return true;
+
+ /* Try to split huge page, so we can truly punch the hole or truncate */
+ return split_huge_page(page) >= 0;
+}
+
+/*
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
@@ -838,31 +864,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (!trylock_page(page))
continue;
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
- if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if ((!unfalloc || !PageUptodate(page)) &&
+ page_mapping(page) == mapping) {
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
truncate_inode_page(mapping, page);
- }
}
unlock_page(page);
}
@@ -936,43 +942,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
lock_page(page);
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- /*
- * Partial thp truncate due 'start' in middle
- * of THP: don't need to look on these pages
- * again on !pvec.nr restart.
- */
- if (index != round_down(end, HPAGE_PMD_NR))
- start++;
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- truncate_inode_page(mapping, page);
- } else {
+ if (page_mapping(page) != mapping) {
/* Page was replaced by swap: retry */
unlock_page(page);
index--;
break;
}
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
+ truncate_inode_page(mapping, page);
+ else {
+ /* Wipe the page and don't get stuck */
+ clear_highpage(page);
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ if (index <
+ round_up(start, HPAGE_PMD_NR))
+ start = index + 1;
+ }
}
unlock_page(page);
}
@@ -1059,7 +1047,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
* Part of the huge page can be beyond i_size: subject
* to shrink under memory pressure.
*/
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
spin_lock(&sbinfo->shrinklist_lock);
/*
* _careful to defend against unlocked access to
@@ -1472,9 +1460,6 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
pgoff_t hindex;
struct page *page;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return NULL;
-
hindex = round_down(index, HPAGE_PMD_NR);
if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
XA_PRESENT))
@@ -1486,6 +1471,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
+ else
+ count_vm_event(THP_FILE_FALLBACK);
return page;
}
@@ -1511,7 +1498,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
int nr;
int err = -ENOSPC;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
@@ -1813,17 +1800,20 @@ repeat:
if (shmem_huge == SHMEM_HUGE_FORCE)
goto alloc_huge;
switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
case SHMEM_HUGE_NEVER:
goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
+ case SHMEM_HUGE_WITHIN_SIZE: {
+ loff_t i_size;
+ pgoff_t off;
+
off = round_up(index, HPAGE_PMD_NR);
i_size = round_up(i_size_read(inode), PAGE_SIZE);
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
- /* fallthrough */
+
+ fallthrough;
+ }
case SHMEM_HUGE_ADVISE:
if (sgp_huge == SGP_HUGE)
goto alloc_huge;
@@ -1871,8 +1861,13 @@ alloc_nohuge:
error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
PageTransHuge(page));
- if (error)
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
goto unacct;
+ }
error = shmem_add_to_page_cache(page, mapping, hindex,
NULL, gfp & GFP_RECLAIM_MASK);
if (error) {
@@ -2089,7 +2084,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
get_area = current->mm->get_unmapped_area;
addr = get_area(file, uaddr, len, pgoff, flags);
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
if (IS_ERR_VALUE(addr))
return addr;
@@ -2228,7 +2223,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
@@ -3113,12 +3108,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- error = 0;
+ if (error && error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
}
inode->i_size = len-1;
@@ -3455,7 +3447,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
case Opt_huge:
ctx->huge = result.uint_32;
if (ctx->huge != SHMEM_HUGE_NEVER &&
- !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
has_transparent_hugepage()))
goto unsupported_parameter;
ctx->seen |= SHMEM_SEEN_HUGE;
@@ -3601,7 +3593,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
@@ -3846,7 +3838,7 @@ static const struct super_operations shmem_ops = {
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
.free_cached_objects = shmem_unused_huge_scan,
#endif
@@ -3908,7 +3900,7 @@ int __init shmem_init(void)
goto out1;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
@@ -3924,7 +3916,7 @@ out2:
return error;
}
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -3976,9 +3968,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
struct kobj_attribute shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
struct inode *inode = file_inode(vma->vm_file);
@@ -4004,7 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
return true;
- /* fall through */
+ fallthrough;
case SHMEM_HUGE_ADVISE:
/* TODO: implement fadvise() hints */
return (vma->vm_flags & VM_HUGEPAGE);
@@ -4013,7 +4005,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
return false;
}
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#else /* !CONFIG_SHMEM */
@@ -4182,7 +4174,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
diff --git a/mm/shuffle.c b/mm/shuffle.c
index c716059cbd3c..44406d9977c7 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -183,11 +183,11 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat)
shuffle_zone(z);
}
-void add_to_free_area_random(struct page *page, struct free_area *area,
- int migratetype)
+bool shuffle_pick_tail(void)
{
static u64 rand;
static u8 rand_bits;
+ bool ret;
/*
* The lack of locking is deliberate. If 2 threads race to
@@ -198,10 +198,10 @@ void add_to_free_area_random(struct page *page, struct free_area *area,
rand = get_random_u64();
}
- if (rand & 1)
- add_to_free_area(page, area, migratetype);
- else
- add_to_free_area_tail(page, area, migratetype);
+ ret = rand & 1;
+
rand_bits--;
rand >>= 1;
+
+ return ret;
}
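
With the list placement moved into __free_one_page(), shuffle_pick_tail() is reduced to a coin flip, and the interesting part is how cheaply that flip is produced: get_random_u64() is called once per 64 decisions and the bits are handed out one at a time, with the unlocked static cache treated as a benign race. The same batching idea is shown below as a standalone userspace sketch; rand() is a stand-in entropy source rather than the kernel RNG, and the function names are made up for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Stand-in for get_random_u64(); the quality of the bits is irrelevant
 * to the batching structure being illustrated. */
static uint64_t random_u64(void)
{
	return ((uint64_t)rand() << 32) ^ (uint64_t)rand();
}

/* Mirrors the structure of shuffle_pick_tail() above: refill a 64-bit
 * cache only when it runs dry, then consume one bit per call. */
static bool pick_tail(void)
{
	static uint64_t cache;
	static uint8_t bits;
	bool ret;

	if (!bits) {
		bits = 64;
		cache = random_u64();
	}

	ret = cache & 1;

	bits--;
	cache >>= 1;

	return ret;
}

Amortizing the RNG call matters here because the caller sits under zone->lock in the page-free path; one get_random_u64() per 64 frees is far cheaper than one per free.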
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 777a257a0d2f..4d79f03b6658 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -22,6 +22,7 @@ enum mm_shuffle_ctl {
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
extern void __shuffle_free_memory(pg_data_t *pgdat);
+extern bool shuffle_pick_tail(void);
static inline void shuffle_free_memory(pg_data_t *pgdat)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
@@ -44,6 +45,11 @@ static inline bool is_shuffle_order(int order)
return order >= SHUFFLE_ORDER;
}
#else
+static inline bool shuffle_pick_tail(void)
+{
+ return false;
+}
+
static inline void shuffle_free_memory(pg_data_t *pgdat)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5282f881d2f5..93ec4a574d8d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file)
}
static const struct proc_ops slabinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = slabinfo_open,
.proc_read = seq_read,
.proc_write = slabinfo_write,
diff --git a/mm/slub.c b/mm/slub.c
index 3098e0cf2899..332d4b459a90 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -449,6 +449,7 @@ static DEFINE_SPINLOCK(object_map_lock);
* not vanish from under us.
*/
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
+ __acquires(&object_map_lock)
{
void *p;
void *addr = page_address(page);
@@ -465,7 +466,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
return object_map;
}
-static void put_map(unsigned long *map)
+static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
lockdep_assert_held(&object_map_lock);
diff --git a/mm/sparse.c b/mm/sparse.c
index f1af4d4ee80b..1aee5a481571 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -209,6 +209,7 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
unsigned long nr_pages)
{
@@ -243,6 +244,11 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
nr_pages -= pfns;
}
}
+#else
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+}
+#endif
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
@@ -660,6 +666,55 @@ static void free_map_bootmem(struct page *memmap)
vmemmap_free(start, end, NULL);
}
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ unsigned long *subsection_map = ms->usage
+ ? &ms->usage->subsection_map[0] : NULL;
+
+ subsection_mask_set(map, pfn, nr_pages);
+ if (subsection_map)
+ bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+ if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+ "section already deactivated (%#lx + %ld)\n",
+ pfn, nr_pages))
+ return -EINVAL;
+
+ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+ return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+ return bitmap_empty(&ms->usage->subsection_map[0],
+ SUBSECTIONS_PER_SECTION);
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ unsigned long *subsection_map;
+ int rc = 0;
+
+ subsection_mask_set(map, pfn, nr_pages);
+
+ subsection_map = &ms->usage->subsection_map[0];
+
+ if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+ rc = -EINVAL;
+ else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+ rc = -EEXIST;
+ else
+ bitmap_or(subsection_map, map, subsection_map,
+ SUBSECTIONS_PER_SECTION);
+
+ return rc;
+}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
@@ -703,48 +758,51 @@ static void free_map_bootmem(struct page *memmap)
put_page_bootmem(page);
}
}
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+ return true;
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ return 0;
+}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+/*
+ * To deactivate a memory region, there are 3 cases to handle across
+ * two configurations (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1. deactivation of a partial hot-added section (only possible in
+ * the SPARSEMEM_VMEMMAP=y case).
+ * a) section was present at memory init.
+ * b) section was hot-added post memory init.
+ * 2. deactivation of a complete hot-added section.
+ * 3. deactivation of a complete section from memory init.
+ *
+ * For 1, when the subsection_map is not empty we will not be freeing the
+ * usage map, but still need to free the vmemmap range.
+ *
+ * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified.
+ */
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
- DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
struct page *memmap = NULL;
bool empty;
- unsigned long *subsection_map = ms->usage
- ? &ms->usage->subsection_map[0] : NULL;
-
- subsection_mask_set(map, pfn, nr_pages);
- if (subsection_map)
- bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
- if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
- "section already deactivated (%#lx + %ld)\n",
- pfn, nr_pages))
+ if (clear_subsection_map(pfn, nr_pages))
return;
- /*
- * There are 3 cases to handle across two configurations
- * (SPARSEMEM_VMEMMAP={y,n}):
- *
- * 1/ deactivation of a partial hot-added section (only possible
- * in the SPARSEMEM_VMEMMAP=y case).
- * a/ section was present at memory init
- * b/ section was hot-added post memory init
- * 2/ deactivation of a complete hot-added section
- * 3/ deactivation of a complete section from memory init
- *
- * For 1/, when subsection_map does not empty we will not be
- * freeing the usage map, but still need to free the vmemmap
- * range.
- *
- * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
- */
- bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
- empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+ empty = is_subsection_map_empty(ms);
if (empty) {
unsigned long section_nr = pfn_to_section_nr(pfn);
@@ -780,31 +838,19 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
static struct page * __meminit section_activate(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
- DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
struct mem_section *ms = __pfn_to_section(pfn);
struct mem_section_usage *usage = NULL;
- unsigned long *subsection_map;
struct page *memmap;
int rc = 0;
- subsection_mask_set(map, pfn, nr_pages);
-
if (!ms->usage) {
usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
if (!usage)
return ERR_PTR(-ENOMEM);
ms->usage = usage;
}
- subsection_map = &ms->usage->subsection_map[0];
-
- if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
- rc = -EINVAL;
- else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
- rc = -EEXIST;
- else
- bitmap_or(subsection_map, map, subsection_map,
- SUBSECTIONS_PER_SECTION);
+ rc = fill_subsection_map(pfn, nr_pages);
if (rc) {
if (usage)
ms->usage = NULL;
@@ -840,6 +886,10 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
*
* This is only intended for hotplug.
*
+ * Note that only VMEMMAP supports sub-section aligned hotplug;
+ * the proper alignment and size are gated by check_pfn_span().
+ *
* Return:
* * 0 - On success.
* * -EEXIST - Section has been present.
diff --git a/mm/swap.c b/mm/swap.c
index a4af8c999963..bf9a79fed62d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -276,7 +276,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
+ int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru);
@@ -394,7 +394,7 @@ void mark_page_accessed(struct page *page)
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
- if (page_is_file_cache(page))
+ if (page_is_file_lru(page))
workingset_activation(page);
}
if (page_is_idle(page))
@@ -515,7 +515,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
return;
active = PageActive(page);
- file = page_is_file_cache(page);
+ file = page_is_file_lru(page);
lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + active);
@@ -548,7 +548,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
+ int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
@@ -573,9 +573,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
ClearPageActive(page);
ClearPageReferenced(page);
/*
- * lazyfree pages are clean anonymous pages. They have
- * SwapBacked flag cleared to distinguish normal anonymous
- * pages
+ * Lazyfree pages are clean anonymous pages. They have
+ * PG_swapbacked flag cleared, to distinguish them from normal
+ * anonymous pages
*/
ClearPageSwapBacked(page);
add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
@@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
if (page_evictable(page)) {
lru = page_lru(page);
- update_page_reclaim_stat(lruvec, page_is_file_cache(page),
+ update_page_reclaim_stat(lruvec, page_is_file_lru(page),
PageActive(page));
if (was_unevictable)
count_vm_event(UNEVICTABLE_PGRESCUED);
@@ -1004,6 +1004,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
* ascending indexes. There may be holes in the indices due to
* not-present entries.
*
+ * Only one subpage of a Transparent Huge Page is returned in one call:
+ * allowing truncate_inode_pages_range() to evict the whole THP without
+ * cycling through a pagevec of extra references.
+ *
* pagevec_lookup_entries() returns the number of entries which were
* found.
*/
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 273a923c275c..5871a2aa86a5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file)
}
static const struct proc_ops swaps_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = swaps_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bd96855f3961..512576e171ce 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **pagep)
+ struct page **pagep,
+ bool wp_copy)
{
struct mem_cgroup *memcg;
pte_t _dst_pte, *dst_pte;
@@ -99,9 +100,13 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+ _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
+ if (dst_vma->vm_flags & VM_WRITE) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
if (dst_vma->vm_file) {
@@ -415,7 +420,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage)
+ bool zeropage,
+ bool wp_copy)
{
ssize_t err;
@@ -432,11 +438,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (!zeropage)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page);
+ dst_addr, src_addr, page,
+ wp_copy);
else
err = mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
} else {
+ VM_WARN_ON_ONCE(wp_copy);
if (!zeropage)
err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
dst_vma, dst_addr,
@@ -454,7 +462,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long src_start,
unsigned long len,
bool zeropage,
- bool *mmap_changing)
+ bool *mmap_changing,
+ __u64 mode)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -462,6 +471,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
+ bool wp_copy;
/*
* Sanitize the command parameters:
@@ -508,6 +518,14 @@ retry:
goto out_unlock;
/*
+ * validate 'mode' now that we know the dst_vma: don't allow
+ * a wrprotect copy if the userfaultfd didn't register as WP.
+ */
+ wp_copy = mode & UFFDIO_COPY_MODE_WP;
+ if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ goto out_unlock;
+
+ /*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
@@ -562,7 +580,7 @@ retry:
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage);
+ src_addr, &page, zeropage, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {
@@ -609,14 +627,68 @@ out:
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- bool *mmap_changing)
+ bool *mmap_changing, __u64 mode)
{
return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing);
+ mmap_changing, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
+ return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+}
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool enable_wp, bool *mmap_changing)
+{
+ struct vm_area_struct *dst_vma;
+ pgprot_t newprot;
+ int err;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(start + len <= start);
+
+ down_read(&dst_mm->mmap_sem);
+
+ /*
+ * If memory mappings are changing because of a non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later.
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ err = -ENOENT;
+ dst_vma = find_dst_vma(dst_mm, start, len);
+ /*
+ * Make sure the vma is not shared and that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (!userfaultfd_wp(dst_vma))
+ goto out_unlock;
+ if (!vma_is_anonymous(dst_vma))
+ goto out_unlock;
+
+ if (enable_wp)
+ newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ else
+ newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+ change_protection(dst_vma, start, start + len, newprot,
+ enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+
+ err = 0;
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+ return err;
}
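
mwriteprotect_range() is the kernel half of userfaultfd write-protect mode: enable_wp maps to MM_CP_UFFD_WP, clearing maps to MM_CP_UFFD_WP_RESOLVE. Userspace is expected to drive it through the UFFDIO_WRITEPROTECT ioctl introduced alongside this series; that uAPI is not part of the mm/ hunks shown here, so the struct and constant names below reflect my understanding of the interface and should be treated as assumptions. The sketch also assumes a userfaultfd descriptor that has already completed the UFFDIO_API handshake and been registered over the range with UFFDIO_REGISTER_MODE_WP.

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/* Toggle write-protection on [addr, addr + len), which must already be
 * registered with the userfaultfd in WP mode. */
static int uffd_wp_range(int uffd, uint64_t addr, uint64_t len, int enable)
{
	struct uffdio_writeprotect wp = {
		.range = { .start = addr, .len = len },
		/* WP bit set   -> kernel takes the MM_CP_UFFD_WP path
		 * WP bit clear -> kernel takes MM_CP_UFFD_WP_RESOLVE */
		.mode = enable ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) < 0) {
		perror("UFFDIO_WRITEPROTECT");
		return -1;
	}
	return 0;
}

A fault-handling thread would typically call uffd_wp_range(uffd, addr, len, 0) after servicing a write fault so the faulting thread can make progress.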
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8eeb0ecee5..399f219544f7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3368,7 +3368,7 @@ retry:
goto overflow;
/*
- * If required width exeeds current VA block, move
+ * If required width exceeds current VA block, move
* base downwards and then recheck.
*/
if (base + end > va->va_end) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e8e690d2813..b06868fc4926 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -919,7 +919,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* exceptional entries and shadow exceptional entries in the
* same address_space.
*/
- if (reclaimed && page_is_file_cache(page) &&
+ if (reclaimed && page_is_file_lru(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
@@ -1043,7 +1043,7 @@ static void page_check_dirty_writeback(struct page *page,
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_cache(page) ||
+ if (!page_is_file_lru(page) ||
(PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
@@ -1315,7 +1315,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* the rest of the LRU for clean pages and see
* the same dirty pages again (PageReclaim).
*/
- if (page_is_file_cache(page) &&
+ if (page_is_file_lru(page) &&
(!current_is_kswapd() || !PageReclaim(page) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
@@ -1459,7 +1459,7 @@ activate_locked:
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
- int type = page_is_file_cache(page);
+ int type = page_is_file_lru(page);
SetPageActive(page);
stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
@@ -1497,7 +1497,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_cache(page) && !PageDirty(page) &&
+ if (page_is_file_lru(page) && !PageDirty(page) &&
!__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
@@ -2053,7 +2053,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
+ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
list_add(&page->lru, &l_active);
continue;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c9c0d71f917f..96d21a792b57 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1256,9 +1256,12 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"thp_fault_alloc",
"thp_fault_fallback",
+ "thp_fault_fallback_charge",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_file_alloc",
+ "thp_file_fallback",
+ "thp_file_fallback_charge",
"thp_file_mapped",
"thp_split_page",
"thp_split_page_failed",
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 22d17ecfe7df..2f836a2b993f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -424,7 +424,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle,
case ZPOOL_MM_WO:
zs_mm = ZS_MM_WO;
break;
- case ZPOOL_MM_RW: /* fall through */
+ case ZPOOL_MM_RW:
default:
zs_mm = ZS_MM_RW;
break;
@@ -891,12 +891,12 @@ static inline int trypin_tag(unsigned long handle)
return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
-static void pin_tag(unsigned long handle)
+static void pin_tag(unsigned long handle) __acquires(bitlock)
{
bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
-static void unpin_tag(unsigned long handle)
+static void unpin_tag(unsigned long handle) __releases(bitlock)
{
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
@@ -1833,12 +1833,12 @@ static void migrate_lock_init(struct zspage *zspage)
rwlock_init(&zspage->lock);
}
-static void migrate_read_lock(struct zspage *zspage)
+static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
{
read_lock(&zspage->lock);
}
-static void migrate_read_unlock(struct zspage *zspage)
+static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
{
read_unlock(&zspage->lock);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 55094e63b72d..fbb782924ccc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -77,8 +77,8 @@ static bool zswap_pool_reached_full;
#define ZSWAP_PARAM_UNSET ""
-/* Enable/disable zswap (disabled by default) */
-static bool zswap_enabled;
+/* Enable/disable zswap */
+static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
const struct kernel_param *);
static struct kernel_param_ops zswap_enabled_param_ops = {
@@ -88,8 +88,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = {
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
/* Crypto compressor to use */
-#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
@@ -101,8 +100,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
&zswap_compressor, 0644);
/* Compressed storage zpool to use */
-#define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
.set = zswap_zpool_param_set,
@@ -599,11 +597,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
bool has_comp, has_zpool;
has_comp = crypto_has_comp(zswap_compressor, 0, 0);
- if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
+ if (!has_comp && strcmp(zswap_compressor,
+ CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
- zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+ zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
- zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+ zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
has_comp = crypto_has_comp(zswap_compressor, 0, 0);
}
if (!has_comp) {
@@ -614,11 +613,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
}
has_zpool = zpool_has_pool(zswap_zpool_type);
- if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+ if (!has_zpool && strcmp(zswap_zpool_type,
+ CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
pr_err("zpool %s not available, using default %s\n",
- zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+ zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
param_free_charp(&zswap_zpool_type);
- zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+ zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
has_zpool = zpool_has_pool(zswap_zpool_type);
}
if (!has_zpool) {