aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-09 10:30:07 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-09 10:30:07 -0700
commitfbf4432ff71b7a25bef993a5312906946d27f446 (patch)
treecf3e0024af4b8f9376eff75743f1fa1526e40900 /mm
parentremove gperf left-overs from build system (diff)
parentipc: optimize semget/shmget/msgget for lots of keys (diff)
downloadlinux-dev-fbf4432ff71b7a25bef993a5312906946d27f446.tar.xz
linux-dev-fbf4432ff71b7a25bef993a5312906946d27f446.zip
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: - most of the rest of MM - a small number of misc things - lib/ updates - checkpatch - autofs updates - ipc/ updates * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (126 commits) ipc: optimize semget/shmget/msgget for lots of keys ipc/sem: play nicer with large nsops allocations ipc/sem: drop sem_checkid helper ipc: convert kern_ipc_perm.refcount from atomic_t to refcount_t ipc: convert sem_undo_list.refcnt from atomic_t to refcount_t ipc: convert ipc_namespace.count from atomic_t to refcount_t kcov: support compat processes sh: defconfig: cleanup from old Kconfig options mn10300: defconfig: cleanup from old Kconfig options m32r: defconfig: cleanup from old Kconfig options drivers/pps: use surrounding "if PPS" to remove numerous dependency checks drivers/pps: aesthetic tweaks to PPS-related content cpumask: make cpumask_next() out-of-line kmod: move #ifdef CONFIG_MODULES wrapper to Makefile kmod: split off umh headers into its own file MAINTAINERS: clarify kmod is just a kernel module loader kmod: split out umh code into its own file test_kmod: flip INT checks to be consistent test_kmod: remove paranoid UINT_MAX check on uint range processing vfat: deduplicate hex2bin() ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig54
-rw-r--r--mm/Makefile1
-rw-r--r--mm/balloon_compaction.c8
-rw-r--r--mm/fadvise.c6
-rw-r--r--mm/gup.c29
-rw-r--r--mm/hmm.c1257
-rw-r--r--mm/huge_memory.c174
-rw-r--r--mm/interval_tree.c10
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c256
-rw-r--r--mm/memory.c138
-rw-r--r--mm/memory_hotplug.c14
-rw-r--r--mm/mempolicy.c142
-rw-r--r--mm/migrate.c1007
-rw-r--r--mm/mlock.c10
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/mprotect.c18
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page_alloc.c13
-rw-r--r--mm/page_vma_mapped.c28
-rw-r--r--mm/pgtable-generic.c3
-rw-r--r--mm/rmap.c50
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c11
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/vmstat.c154
-rw-r--r--mm/zsmalloc.c17
28 files changed, 3180 insertions, 241 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0ded10a22639..9c4bdddd80c2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -262,6 +262,9 @@ config MIGRATION
config ARCH_ENABLE_HUGEPAGE_MIGRATION
bool
+config ARCH_ENABLE_THP_MIGRATION
+ bool
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
@@ -673,7 +676,7 @@ config ARCH_HAS_ZONE_DEVICE
bool
config ZONE_DEVICE
- bool "Device memory (pmem, etc...) hotplug support"
+ bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
@@ -689,6 +692,55 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config ARCH_HAS_HMM
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on ZONE_DEVICE
+ depends on MMU && 64BIT
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+
+config MIGRATE_VMA_HELPER
+ bool
+
+config HMM
+ bool
+ select MIGRATE_VMA_HELPER
+
+config HMM_MIRROR
+ bool "HMM mirror CPU page table into a device page table"
+ depends on ARCH_HAS_HMM
+ select MMU_NOTIFIER
+ select HMM
+ help
+ Select HMM_MIRROR if you want to mirror range of the CPU page table of a
+ process into a device page table. Here, mirror means "keep synchronized".
+ Prerequisites: the device must provide the ability to write-protect its
+ page tables (at PAGE_SIZE granularity), and must be able to recover from
+ the resulting potential page faults.
+
+config DEVICE_PRIVATE
+ bool "Unaddressable device memory (GPU memory, ...)"
+ depends on ARCH_HAS_HMM
+ select HMM
+
+ help
+ Allows creation of struct pages to represent unaddressable device
+ memory; i.e., memory that is only accessible from the device (or
+ group of devices). You likely also want to select HMM_MIRROR.
+
+config DEVICE_PUBLIC
+ bool "Addressable device memory (like GPU memory)"
+ depends on ARCH_HAS_HMM
+ select HMM
+
+ help
+ Allows creation of struct pages to represent addressable device
+ memory; i.e., memory that is accessible from both the device and
+ the CPU
+
config FRAME_VECTOR
bool
diff --git a/mm/Makefile b/mm/Makefile
index 411bd24d4a7c..e3ac3aeb533b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
+obj-$(CONFIG_HMM) += hmm.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index b06d9fe23a28..68d28924ba79 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping,
{
struct balloon_dev_info *balloon = balloon_page_device(page);
+ /*
+ * We can not easily support the no copy case here so ignore it as it
+ * is unlikely to be use with ballon pages. See include/linux/hmm.h for
+ * user of the MIGRATE_SYNC_NO_COPY mode.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a43013112581..702f239cd6db 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -52,7 +52,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
goto out;
}
- if (IS_DAX(inode)) {
+ bdi = inode_to_bdi(mapping->host);
+
+ if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
@@ -75,8 +77,6 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
else
endbyte--; /* inclusive */
- bdi = inode_to_bdi(mapping->host);
-
switch (advice) {
case POSIX_FADV_NORMAL:
f.file->f_ra.ra_pages = bdi->ra_pages;
diff --git a/mm/gup.c b/mm/gup.c
index 33d651deeae2..b2b4d4263768 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
+retry:
+ if (!pmd_present(*pmd)) {
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(*pmd));
+ if (is_pmd_migration_entry(*pmd))
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry;
+ }
if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
@@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
+retry_locked:
ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_present(*pmd))) {
+ spin_unlock(ptl);
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry_locked;
+ }
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags);
@@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pud = pud_offset(p4d, address);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
+ if (!pmd_present(*pmd))
return -EFAULT;
VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, address);
@@ -438,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
+
+ /*
+ * This should never happen (a device public page in the gate
+ * area).
+ */
+ if (is_device_public_page(*page))
+ goto unmap;
}
get_page(*page);
out:
@@ -1534,7 +1559,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ if (!pmd_present(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
diff --git a/mm/hmm.c b/mm/hmm.c
new file mode 100644
index 000000000000..a88a847bccba
--- /dev/null
+++ b/mm/hmm.c
@@ -0,0 +1,1257 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/*
+ * Refer to include/linux/hmm.h for information about heterogeneous memory
+ * management or HMM for short.
+ */
+#include <linux/mm.h>
+#include <linux/hmm.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memremap.h>
+#include <linux/jump_label.h>
+#include <linux/mmu_notifier.h>
+#include <linux/memory_hotplug.h>
+
+#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+/*
+ * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
+ */
+DEFINE_STATIC_KEY_FALSE(device_private_key);
+EXPORT_SYMBOL(device_private_key);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @sequence: we track updates to the CPU page table with a sequence number
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ */
+struct hmm {
+ struct mm_struct *mm;
+ spinlock_t lock;
+ atomic_t sequence;
+ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+};
+
+/*
+ * hmm_register - register HMM against an mm (HMM internal)
+ *
+ * @mm: mm struct to attach to
+ *
+ * This is not intended to be used directly by device drivers. It allocates an
+ * HMM struct if mm does not have one, and initializes it.
+ */
+static struct hmm *hmm_register(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
+ bool cleanup = false;
+
+ /*
+ * The hmm struct can only be freed once the mm_struct goes away,
+ * hence we should always have pre-allocated an new hmm struct
+ * above.
+ */
+ if (hmm)
+ return hmm;
+
+ hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+ if (!hmm)
+ return NULL;
+ INIT_LIST_HEAD(&hmm->mirrors);
+ init_rwsem(&hmm->mirrors_sem);
+ atomic_set(&hmm->sequence, 0);
+ hmm->mmu_notifier.ops = NULL;
+ INIT_LIST_HEAD(&hmm->ranges);
+ spin_lock_init(&hmm->lock);
+ hmm->mm = mm;
+
+ /*
+ * We should only get here if hold the mmap_sem in write mode ie on
+ * registration of first mirror through hmm_mirror_register()
+ */
+ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+ if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+ kfree(hmm);
+ return NULL;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (!mm->hmm)
+ mm->hmm = hmm;
+ else
+ cleanup = true;
+ spin_unlock(&mm->page_table_lock);
+
+ if (cleanup) {
+ mmu_notifier_unregister(&hmm->mmu_notifier, mm);
+ kfree(hmm);
+ }
+
+ return mm->hmm;
+}
+
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ kfree(mm->hmm);
+}
+
+static void hmm_invalidate_range(struct hmm *hmm,
+ enum hmm_update_type action,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm_mirror *mirror;
+ struct hmm_range *range;
+
+ spin_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ unsigned long addr, idx, npages;
+
+ if (end < range->start || start >= range->end)
+ continue;
+
+ range->valid = false;
+ addr = max(start, range->start);
+ idx = (addr - range->start) >> PAGE_SHIFT;
+ npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
+ memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
+ }
+ spin_unlock(&hmm->lock);
+
+ down_read(&hmm->mirrors_sem);
+ list_for_each_entry(mirror, &hmm->mirrors, list)
+ mirror->ops->sync_cpu_device_pagetables(mirror, action,
+ start, end);
+ up_read(&hmm->mirrors_sem);
+}
+
+static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm *hmm = mm->hmm;
+
+ VM_BUG_ON(!hmm);
+
+ atomic_inc(&hmm->sequence);
+}
+
+static void hmm_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm *hmm = mm->hmm;
+
+ VM_BUG_ON(!hmm);
+
+ hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+}
+
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+ .invalidate_range_start = hmm_invalidate_range_start,
+ .invalidate_range_end = hmm_invalidate_range_end,
+};
+
+/*
+ * hmm_mirror_register() - register a mirror against an mm
+ *
+ * @mirror: new mirror struct to register
+ * @mm: mm to register against
+ *
+ * To start mirroring a process address space, the device driver must register
+ * an HMM mirror struct.
+ *
+ * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
+ */
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
+{
+ /* Sanity check */
+ if (!mm || !mirror || !mirror->ops)
+ return -EINVAL;
+
+ mirror->hmm = hmm_register(mm);
+ if (!mirror->hmm)
+ return -ENOMEM;
+
+ down_write(&mirror->hmm->mirrors_sem);
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL(hmm_mirror_register);
+
+/*
+ * hmm_mirror_unregister() - unregister a mirror
+ *
+ * @mirror: new mirror struct to register
+ *
+ * Stop mirroring a process address space, and cleanup.
+ */
+void hmm_mirror_unregister(struct hmm_mirror *mirror)
+{
+ struct hmm *hmm = mirror->hmm;
+
+ down_write(&hmm->mirrors_sem);
+ list_del(&mirror->list);
+ up_write(&hmm->mirrors_sem);
+}
+EXPORT_SYMBOL(hmm_mirror_unregister);
+
+struct hmm_vma_walk {
+ struct hmm_range *range;
+ unsigned long last;
+ bool fault;
+ bool block;
+ bool write;
+};
+
+static int hmm_vma_do_fault(struct mm_walk *walk,
+ unsigned long addr,
+ hmm_pfn_t *pfn)
+{
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int r;
+
+ flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
+ flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+ r = handle_mm_fault(vma, addr, flags);
+ if (r & VM_FAULT_RETRY)
+ return -EBUSY;
+ if (r & VM_FAULT_ERROR) {
+ *pfn = HMM_PFN_ERROR;
+ return -EFAULT;
+ }
+
+ return -EAGAIN;
+}
+
+static void hmm_pfns_special(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = HMM_PFN_SPECIAL;
+}
+
+static int hmm_pfns_bad(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_range *range = walk->private;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++)
+ pfns[i] = HMM_PFN_ERROR;
+
+ return 0;
+}
+
+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = 0;
+}
+
+static int hmm_vma_walk_hole(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = HMM_PFN_EMPTY;
+ if (hmm_vma_walk->fault) {
+ int ret;
+
+ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+ }
+
+ return hmm_vma_walk->fault ? -EAGAIN : 0;
+}
+
+static int hmm_vma_walk_clear(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = 0;
+ if (hmm_vma_walk->fault) {
+ int ret;
+
+ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+ }
+
+ return hmm_vma_walk->fault ? -EAGAIN : 0;
+}
+
+static int hmm_vma_walk_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long addr = start, i;
+ bool write_fault;
+ hmm_pfn_t flag;
+ pte_t *ptep;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
+ write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;
+
+again:
+ if (pmd_none(*pmdp))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
+ return hmm_pfns_bad(start, end, walk);
+
+ if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
+ unsigned long pfn;
+ pmd_t pmd;
+
+ /*
+ * No need to take pmd_lock here, even if some other threads
+ * is splitting the huge pmd we will get that event through
+ * mmu_notifier callback.
+ *
+ * So just read pmd value and check again its a transparent
+ * huge or device mapping one and compute corresponding pfn
+ * values.
+ */
+ pmd = pmd_read_atomic(pmdp);
+ barrier();
+ if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ goto again;
+ if (pmd_protnone(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
+ if (write_fault && !pmd_write(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
+ pfn = pmd_pfn(pmd) + pte_index(addr);
+ flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
+ return 0;
+ }
+
+ if (pmd_bad(*pmdp))
+ return hmm_pfns_bad(start, end, walk);
+
+ ptep = pte_offset_map(pmdp, addr);
+ for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+ pte_t pte = *ptep;
+
+ pfns[i] = 0;
+
+ if (pte_none(pte)) {
+ pfns[i] = HMM_PFN_EMPTY;
+ if (hmm_vma_walk->fault)
+ goto fault;
+ continue;
+ }
+
+ if (!pte_present(pte)) {
+ swp_entry_t entry;
+
+ if (!non_swap_entry(entry)) {
+ if (hmm_vma_walk->fault)
+ goto fault;
+ continue;
+ }
+
+ entry = pte_to_swp_entry(pte);
+
+ /*
+ * This is a special swap entry, ignore migration, use
+ * device and report anything else as error.
+ */
+ if (is_device_private_entry(entry)) {
+ pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
+ if (is_write_device_private_entry(entry)) {
+ pfns[i] |= HMM_PFN_WRITE;
+ } else if (write_fault)
+ goto fault;
+ pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
+ pfns[i] |= flag;
+ } else if (is_migration_entry(entry)) {
+ if (hmm_vma_walk->fault) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(vma->vm_mm,
+ pmdp, addr);
+ return -EAGAIN;
+ }
+ continue;
+ } else {
+ /* Report error for everything else */
+ pfns[i] = HMM_PFN_ERROR;
+ }
+ continue;
+ }
+
+ if (write_fault && !pte_write(pte))
+ goto fault;
+
+ pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
+ pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+ continue;
+
+fault:
+ pte_unmap(ptep);
+ /* Fault all pages in range */
+ return hmm_vma_walk_clear(start, end, walk);
+ }
+ pte_unmap(ptep - 1);
+
+ return 0;
+}
+
+/*
+ * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
+ * @vma: virtual memory area containing the virtual address range
+ * @range: used to track snapshot validity
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @entries: array of hmm_pfn_t: provided by the caller, filled in by function
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
+ *
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by range struct. See hmm_vma_range_done() for further
+ * information.
+ *
+ * The range struct is initialized here. It tracks the CPU page table, but only
+ * if the function returns success (0), in which case the caller must then call
+ * hmm_vma_range_done() to stop CPU page table update tracking on this range.
+ *
+ * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
+ * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns)
+{
+ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(pfns, start, end);
+ return -EINVAL;
+ }
+
+ /* Sanity check, this really should not happen ! */
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm)
+ return -ENOMEM;
+ /* Caller must have registered a mirror, via hmm_mirror_register() ! */
+ if (!hmm->mmu_notifier.ops)
+ return -EINVAL;
+
+ /* Initialize range to track CPU page table update */
+ range->start = start;
+ range->pfns = pfns;
+ range->end = end;
+ spin_lock(&hmm->lock);
+ range->valid = true;
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ walk_page_range(start, end, &mm_walk);
+ return 0;
+}
+EXPORT_SYMBOL(hmm_vma_get_pfns);
+
+/*
+ * hmm_vma_range_done() - stop tracking change to CPU page table over a range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: range being tracked
+ * Returns: false if range data has been invalidated, true otherwise
+ *
+ * Range struct is used to track updates to the CPU page table after a call to
+ * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
+ * using the data, or wants to lock updates to the data it got from those
+ * functions, it must call the hmm_vma_range_done() function, which will then
+ * stop tracking CPU page table updates.
+ *
+ * Note that device driver must still implement general CPU page table update
+ * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
+ * the mmu_notifier API directly.
+ *
+ * CPU page table update tracking done through hmm_range is only temporary and
+ * to be used while trying to duplicate CPU page table contents for a range of
+ * virtual addresses.
+ *
+ * There are two ways to use this :
+ * again:
+ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * trans = device_build_page_table_update_transaction(pfns);
+ * device_page_table_lock();
+ * if (!hmm_vma_range_done(vma, range)) {
+ * device_page_table_unlock();
+ * goto again;
+ * }
+ * device_commit_transaction(trans);
+ * device_page_table_unlock();
+ *
+ * Or:
+ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * device_page_table_lock();
+ * hmm_vma_range_done(vma, range);
+ * device_update_page_table(pfns);
+ * device_page_table_unlock();
+ */
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
+{
+ unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
+ struct hmm *hmm;
+
+ if (range->end <= range->start) {
+ BUG();
+ return false;
+ }
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm) {
+ memset(range->pfns, 0, sizeof(*range->pfns) * npages);
+ return false;
+ }
+
+ spin_lock(&hmm->lock);
+ list_del_rcu(&range->list);
+ spin_unlock(&hmm->lock);
+
+ return range->valid;
+}
+EXPORT_SYMBOL(hmm_vma_range_done);
+
+/*
+ * hmm_vma_fault() - try to fault some address in a virtual address range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: use to track pfns array content validity
+ * @start: fault range virtual start address (inclusive)
+ * @end: fault range virtual end address (exclusive)
+ * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted
+ * @write: is it a write fault
+ * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
+ * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+ *
+ * This is similar to a regular CPU page fault except that it will not trigger
+ * any memory migration if the memory being faulted is not accessible by CPUs.
+ *
+ * On error, for one virtual address in the range, the function will set the
+ * hmm_pfn_t error flag for the corresponding pfn entry.
+ *
+ * Expected use pattern:
+ * retry:
+ * down_read(&mm->mmap_sem);
+ * // Find vma and address device wants to fault, initialize hmm_pfn_t
+ * // array accordingly
+ * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry);
+ * switch (ret) {
+ * case -EAGAIN:
+ * hmm_vma_range_done(vma, range);
+ * // You might want to rate limit or yield to play nicely, you may
+ * // also commit any valid pfn in the array assuming that you are
+ * // getting true from hmm_vma_range_monitor_end()
+ * goto retry;
+ * case 0:
+ * break;
+ * default:
+ * // Handle error !
+ * up_read(&mm->mmap_sem)
+ * return;
+ * }
+ * // Take device driver lock that serialize device page table update
+ * driver_lock_device_page_table_update();
+ * hmm_vma_range_done(vma, range);
+ * // Commit pfns we got from hmm_vma_fault()
+ * driver_unlock_device_page_table_update();
+ * up_read(&mm->mmap_sem)
+ *
+ * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
+ * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
+ *
+ * YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block)
+{
+ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+ int ret;
+
+ /* Sanity check, this really should not happen ! */
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm) {
+ hmm_pfns_clear(pfns, start, end);
+ return -ENOMEM;
+ }
+ /* Caller must have registered a mirror using hmm_mirror_register() */
+ if (!hmm->mmu_notifier.ops)
+ return -EINVAL;
+
+ /* Initialize range to track CPU page table update */
+ range->start = start;
+ range->pfns = pfns;
+ range->end = end;
+ spin_lock(&hmm->lock);
+ range->valid = true;
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(pfns, start, end);
+ return 0;
+ }
+
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.write = write;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ hmm_vma_walk.last = range->start;
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+ } while (ret == -EAGAIN);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
+ hmm_vma_range_done(vma, range);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(hmm_vma_fault);
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return NULL;
+ lock_page(page);
+ return page;
+}
+EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
+
+
+static void hmm_devmem_ref_release(struct percpu_ref *ref)
+{
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ complete(&devmem->completion);
+}
+
+static void hmm_devmem_ref_exit(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ percpu_ref_exit(ref);
+ devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
+}
+
+static void hmm_devmem_ref_kill(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ percpu_ref_kill(ref);
+ wait_for_completion(&devmem->completion);
+ devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
+}
+
+static int hmm_devmem_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct hmm_devmem *devmem = page->pgmap->data;
+
+ return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
+}
+
+static void hmm_devmem_free(struct page *page, void *data)
+{
+ struct hmm_devmem *devmem = data;
+
+ devmem->ops->free(devmem, page);
+}
+
+static DEFINE_MUTEX(hmm_devmem_lock);
+static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
+
+static void hmm_devmem_radix_release(struct resource *resource)
+{
+ resource_size_t key, align_start, align_size, align_end;
+
+ align_start = resource->start & ~(PA_SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
+ align_end = align_start + align_size - 1;
+
+ mutex_lock(&hmm_devmem_lock);
+ for (key = resource->start;
+ key <= resource->end;
+ key += PA_SECTION_SIZE)
+ radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&hmm_devmem_lock);
+}
+
+static void hmm_devmem_release(struct device *dev, void *data)
+{
+ struct hmm_devmem *devmem = data;
+ struct resource *resource = devmem->resource;
+ unsigned long start_pfn, npages;
+ struct zone *zone;
+ struct page *page;
+
+ if (percpu_ref_tryget_live(&devmem->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(&devmem->ref);
+ }
+
+ /* pages are dead and unused, undo the arch mapping */
+ start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
+ npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
+
+ page = pfn_to_page(start_pfn);
+ zone = page_zone(page);
+
+ mem_hotplug_begin();
+ if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+ __remove_pages(zone, start_pfn, npages);
+ else
+ arch_remove_memory(start_pfn << PAGE_SHIFT,
+ npages << PAGE_SHIFT);
+ mem_hotplug_done();
+
+ hmm_devmem_radix_release(resource);
+}
+
+static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
+}
+
+static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
+{
+ resource_size_t key, align_start, align_size, align_end;
+ struct device *device = devmem->device;
+ int ret, nid, is_ram;
+ unsigned long pfn;
+
+ align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
+ align_size = ALIGN(devmem->resource->start +
+ resource_size(devmem->resource),
+ PA_SECTION_SIZE) - align_start;
+
+ is_ram = region_intersects(align_start, align_size,
+ IORESOURCE_SYSTEM_RAM,
+ IORES_DESC_NONE);
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+ __func__, devmem->resource);
+ return -ENXIO;
+ }
+ if (is_ram == REGION_INTERSECTS)
+ return -ENXIO;
+
+ if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+ devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+ else
+ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
+ devmem->pagemap.res = devmem->resource;
+ devmem->pagemap.page_fault = hmm_devmem_fault;
+ devmem->pagemap.page_free = hmm_devmem_free;
+ devmem->pagemap.dev = devmem->device;
+ devmem->pagemap.ref = &devmem->ref;
+ devmem->pagemap.data = devmem;
+
+ mutex_lock(&hmm_devmem_lock);
+ align_end = align_start + align_size - 1;
+ for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
+ struct hmm_devmem *dup;
+
+ rcu_read_lock();
+ dup = hmm_devmem_find(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(device, "%s: collides with mapping for %s\n",
+ __func__, dev_name(dup->device));
+ mutex_unlock(&hmm_devmem_lock);
+ ret = -EBUSY;
+ goto error;
+ }
+ ret = radix_tree_insert(&hmm_devmem_radix,
+ key >> PA_SECTION_SHIFT,
+ devmem);
+ if (ret) {
+ dev_err(device, "%s: failed: %d\n", __func__, ret);
+ mutex_unlock(&hmm_devmem_lock);
+ goto error_radix;
+ }
+ }
+ mutex_unlock(&hmm_devmem_lock);
+
+ nid = dev_to_node(device);
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ mem_hotplug_begin();
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For device public memory, which is accesible by the CPU, we do
+ * want the linear mapping and thus use arch_add_memory().
+ */
+ if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+ ret = arch_add_memory(nid, align_start, align_size, false);
+ else
+ ret = add_pages(nid, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, false);
+ if (ret) {
+ mem_hotplug_done();
+ goto error_add_memory;
+ }
+ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT);
+ mem_hotplug_done();
+
+ for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ page->pgmap = &devmem->pagemap;
+ }
+ return 0;
+
+error_add_memory:
+ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+error_radix:
+ hmm_devmem_radix_release(devmem->resource);
+error:
+ return ret;
+}
+
+static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
+{
+ struct hmm_devmem *devmem = data;
+
+ return devmem->resource == match_data;
+}
+
+static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
+{
+ devres_release(devmem->device, &hmm_devmem_release,
+ &hmm_devmem_match, devmem->resource);
+}
+
+/*
+ * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
+ *
+ * @ops: memory event device driver callback (see struct hmm_devmem_ops)
+ * @device: device struct to bind the resource too
+ * @size: size in bytes of the device memory to add
+ * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
+ *
+ * This function first finds an empty range of physical address big enough to
+ * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
+ * in turn allocates struct pages. It does not do anything beyond that; all
+ * events affecting the memory will go through the various callbacks provided
+ * by hmm_devmem_ops struct.
+ *
+ * Device driver should call this function during device initialization and
+ * is then responsible of memory management. HMM only provides helpers.
+ */
+struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size)
+{
+ struct hmm_devmem *devmem;
+ resource_size_t addr;
+ int ret;
+
+ static_branch_enable(&device_private_key);
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = NULL;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+ size = ALIGN(size, PA_SECTION_SIZE);
+ addr = min((unsigned long)iomem_resource.end,
+ (1UL << MAX_PHYSMEM_BITS) - 1);
+ addr = addr - size + 1UL;
+
+ /*
+ * FIXME add a new helper to quickly walk resource tree and find free
+ * range
+ *
+ * FIXME what about ioport_resource resource ?
+ */
+ for (; addr > size && addr >= iomem_resource.start; addr -= size) {
+ ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
+ if (ret != REGION_DISJOINT)
+ continue;
+
+ devmem->resource = devm_request_mem_region(device, addr, size,
+ dev_name(device));
+ if (!devmem->resource) {
+ ret = -ENOMEM;
+ goto error_no_resource;
+ }
+ break;
+ }
+ if (!devmem->resource) {
+ ret = -ERANGE;
+ goto error_no_resource;
+ }
+
+ devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_pages;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_pages:
+ devm_release_mem_region(device, devmem->resource->start,
+ resource_size(devmem->resource));
+error_no_resource:
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add);
+
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res)
+{
+ struct hmm_devmem *devmem;
+ int ret;
+
+ if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+ return ERR_PTR(-EINVAL);
+
+ static_branch_enable(&device_private_key);
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = res;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_devm_add_action;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add_resource);
+
+/*
+ * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
+ *
+ * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
+ *
+ * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
+ * of the device driver. It will free struct page and remove the resource that
+ * reserved the physical address range for this device memory.
+ */
+void hmm_devmem_remove(struct hmm_devmem *devmem)
+{
+ resource_size_t start, size;
+ struct device *device;
+ bool cdm = false;
+
+ if (!devmem)
+ return;
+
+ device = devmem->device;
+ start = devmem->resource->start;
+ size = resource_size(devmem->resource);
+
+ cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+ hmm_devmem_pages_remove(devmem);
+
+ if (!cdm)
+ devm_release_mem_region(device, start, size);
+}
+EXPORT_SYMBOL(hmm_devmem_remove);
+
+/*
+ * A device driver that wants to handle multiple devices memory through a
+ * single fake device can use hmm_device to do so. This is purely a helper
+ * and it is not needed to make use of any HMM functionality.
+ */
+#define HMM_DEVICE_MAX 256
+
+static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
+static DEFINE_SPINLOCK(hmm_device_lock);
+static struct class *hmm_device_class;
+static dev_t hmm_device_devt;
+
+static void hmm_device_release(struct device *device)
+{
+ struct hmm_device *hmm_device;
+
+ hmm_device = container_of(device, struct hmm_device, device);
+ spin_lock(&hmm_device_lock);
+ clear_bit(hmm_device->minor, hmm_device_mask);
+ spin_unlock(&hmm_device_lock);
+
+ kfree(hmm_device);
+}
+
+struct hmm_device *hmm_device_new(void *drvdata)
+{
+ struct hmm_device *hmm_device;
+
+ hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
+ if (!hmm_device)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&hmm_device_lock);
+ hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
+ if (hmm_device->minor >= HMM_DEVICE_MAX) {
+ spin_unlock(&hmm_device_lock);
+ kfree(hmm_device);
+ return ERR_PTR(-EBUSY);
+ }
+ set_bit(hmm_device->minor, hmm_device_mask);
+ spin_unlock(&hmm_device_lock);
+
+ dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
+ hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
+ hmm_device->minor);
+ hmm_device->device.release = hmm_device_release;
+ dev_set_drvdata(&hmm_device->device, drvdata);
+ hmm_device->device.class = hmm_device_class;
+ device_initialize(&hmm_device->device);
+
+ return hmm_device;
+}
+EXPORT_SYMBOL(hmm_device_new);
+
+void hmm_device_put(struct hmm_device *hmm_device)
+{
+ put_device(&hmm_device->device);
+}
+EXPORT_SYMBOL(hmm_device_put);
+
+static int __init hmm_init(void)
+{
+ int ret;
+
+ ret = alloc_chrdev_region(&hmm_device_devt, 0,
+ HMM_DEVICE_MAX,
+ "hmm_device");
+ if (ret)
+ return ret;
+
+ hmm_device_class = class_create(THIS_MODULE, "hmm_device");
+ if (IS_ERR(hmm_device_class)) {
+ unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
+ return PTR_ERR(hmm_device_class);
+ }
+ return 0;
+}
+
+device_initcall(hmm_init);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0b51e70e0a8b..269b5df58543 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -928,6 +928,25 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (unlikely(is_swap_pmd(pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ if (is_write_migration_entry(entry)) {
+ make_migration_entry_read(&entry);
+ pmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ ret = 0;
+ goto out_unlock;
+ }
+#endif
+
if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable);
goto out_unlock;
@@ -1599,6 +1618,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(orig_pmd))
goto out;
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto out;
+ }
+
page = pmd_page(orig_pmd);
/*
* If other processes are mapping this page, we couldn't discard
@@ -1684,10 +1709,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
- struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page, true);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ struct page *page = NULL;
+ int flush_needed = 1;
+
+ if (pmd_present(orig_pmd)) {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page, true);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ } else if (thp_migration_supported()) {
+ swp_entry_t entry;
+
+ VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
+ entry = pmd_to_swp_entry(orig_pmd);
+ page = pfn_to_page(swp_offset(entry));
+ flush_needed = 0;
+ } else
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
if (PageAnon(page)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1696,8 +1735,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
+
spin_unlock(ptl);
- tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+ if (flush_needed)
+ tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
}
return 1;
}
@@ -1717,6 +1758,17 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
}
#endif
+static pmd_t move_soft_dirty_pmd(pmd_t pmd)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ if (unlikely(is_pmd_migration_entry(pmd)))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ else if (pmd_present(pmd))
+ pmd = pmd_mksoft_dirty(pmd);
+#endif
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1759,7 +1811,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
- set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+ pmd = move_soft_dirty_pmd(pmd);
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
if (force_flush)
@@ -1794,6 +1847,27 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (is_swap_pmd(*pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+ if (is_write_migration_entry(entry)) {
+ pmd_t newpmd;
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ make_migration_entry_read(&entry);
+ newpmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*pmd))
+ newpmd = pmd_swp_mksoft_dirty(newpmd);
+ set_pmd_at(mm, addr, pmd, newpmd);
+ }
+ goto unlock;
+ }
+#endif
+
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -1859,7 +1933,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
+ pmd_devmap(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -1977,14 +2052,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t _pmd;
- bool young, write, dirty, soft_dirty;
+ bool young, write, dirty, soft_dirty, pmd_migration = false;
unsigned long addr;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+ && !pmd_devmap(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2009,7 +2085,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- page = pmd_page(*pmd);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ pmd_migration = is_pmd_migration_entry(*pmd);
+ if (pmd_migration) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(*pmd);
+ page = pfn_to_page(swp_offset(entry));
+ } else
+#endif
+ page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
@@ -2028,7 +2113,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* transferred to avoid any possibility of altering
* permissions across VMAs.
*/
- if (freeze) {
+ if (freeze || pmd_migration) {
swp_entry_t swp_entry;
swp_entry = make_migration_entry(page + i, write);
entry = swp_entry_to_pte(swp_entry);
@@ -2127,7 +2212,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(*pmd);
if (PageMlocked(page))
clear_page_mlock(page);
- } else if (!pmd_devmap(*pmd))
+ } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
@@ -2210,7 +2295,7 @@ static void freeze_page(struct page *page)
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
- ttu_flags |= TTU_MIGRATION;
+ ttu_flags |= TTU_SPLIT_FREEZE;
unmap_success = try_to_unmap(page, ttu_flags);
VM_BUG_ON_PAGE(!unmap_success, page);
@@ -2745,3 +2830,66 @@ static int __init split_huge_pages_debugfs(void)
}
late_initcall(split_huge_pages_debugfs);
#endif
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = pvmw->address;
+ pmd_t pmdval;
+ swp_entry_t entry;
+ pmd_t pmdswp;
+
+ if (!(pvmw->pmd && !pvmw->pte))
+ return;
+
+ mmu_notifier_invalidate_range_start(mm, address,
+ address + HPAGE_PMD_SIZE);
+
+ flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmdval = *pvmw->pmd;
+ pmdp_invalidate(vma, address, pvmw->pmd);
+ if (pmd_dirty(pmdval))
+ set_page_dirty(page);
+ entry = make_migration_entry(page, pmd_write(pmdval));
+ pmdswp = swp_entry_to_pmd(entry);
+ if (pmd_soft_dirty(pmdval))
+ pmdswp = pmd_swp_mksoft_dirty(pmdswp);
+ set_pmd_at(mm, address, pvmw->pmd, pmdswp);
+ page_remove_rmap(page, true);
+ put_page(page);
+
+ mmu_notifier_invalidate_range_end(mm, address,
+ address + HPAGE_PMD_SIZE);
+}
+
+void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = pvmw->address;
+ unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ pmd_t pmde;
+ swp_entry_t entry;
+
+ if (!(pvmw->pmd && !pvmw->pte))
+ return;
+
+ entry = pmd_to_swp_entry(*pvmw->pmd);
+ get_page(new);
+ pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+ if (pmd_swp_soft_dirty(*pvmw->pmd))
+ pmde = pmd_mksoft_dirty(pmde);
+ if (is_write_migration_entry(entry))
+ pmde = maybe_pmd_mkwrite(pmde, vma);
+
+ flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
+ page_add_anon_rmap(new, vma, mmun_start, true);
+ set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(new);
+ update_mmu_cache_pmd(vma, address, pvmw->pmd);
+}
+#endif
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index f2c2492681bf..b47664358796 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -28,7 +28,7 @@ INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *prev,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
struct rb_node **link;
struct vm_area_struct *parent;
@@ -55,7 +55,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
node->shared.rb_subtree_last = last;
rb_link_node(&node->shared.rb, &parent->shared.rb, link);
- rb_insert_augmented(&node->shared.rb, root,
+ rb_insert_augmented(&node->shared.rb, &root->rb_root,
&vma_interval_tree_augment);
}
@@ -74,7 +74,7 @@ INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
static inline, __anon_vma_interval_tree)
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
node->cached_vma_start = avc_start_pgoff(node);
@@ -84,13 +84,13 @@ void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
}
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
__anon_vma_interval_tree_remove(node, root);
}
struct anon_vma_chain *
-anon_vma_interval_tree_iter_first(struct rb_root *root,
+anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long first, unsigned long last)
{
return __anon_vma_interval_tree_iter_first(root, first, last);
diff --git a/mm/madvise.c b/mm/madvise.c
index eea1c733286f..21261ff0466f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (!page)
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6532b219b222..15af3da5af02 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -119,6 +119,7 @@ static const char *const mem_cgroup_lru_names[] = {
struct mem_cgroup_tree_per_node {
struct rb_root rb_root;
+ struct rb_node *rb_rightmost;
spinlock_t lock;
};
@@ -386,6 +387,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_node *mz_node;
+ bool rightmost = true;
if (mz->on_tree)
return;
@@ -397,8 +399,11 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node);
- if (mz->usage_in_excess < mz_node->usage_in_excess)
+ if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
+ rightmost = false;
+ }
+
/*
* We can't avoid mem cgroups that are over their soft
* limit by the same amount
@@ -406,6 +411,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
else if (mz->usage_in_excess >= mz_node->usage_in_excess)
p = &(*p)->rb_right;
}
+
+ if (rightmost)
+ mctz->rb_rightmost = &mz->tree_node;
+
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
@@ -416,6 +425,10 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
{
if (!mz->on_tree)
return;
+
+ if (&mz->tree_node == mctz->rb_rightmost)
+ mctz->rb_rightmost = rb_prev(&mz->tree_node);
+
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
@@ -496,16 +509,15 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
- struct rb_node *rightmost = NULL;
struct mem_cgroup_per_node *mz;
retry:
mz = NULL;
- rightmost = rb_last(&mctz->rb_root);
- if (!rightmost)
+ if (!mctz->rb_rightmost)
goto done; /* Nothing to reclaim from */
- mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
+ mz = rb_entry(mctz->rb_rightmost,
+ struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
@@ -1792,6 +1804,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
}
stock->nr_pages += nr_pages;
+ if (stock->nr_pages > CHARGE_BATCH)
+ drain_stock(stock);
+
local_irq_restore(flags);
}
@@ -4414,12 +4429,13 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+ MC_TARGET_DEVICE,
};
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = vm_normal_page(vma, addr, ptent);
+ struct page *page = _vm_normal_page(vma, addr, ptent, true);
if (!page || !page_mapped(page))
return NULL;
@@ -4436,7 +4452,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
return page;
}
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
pte_t ptent, swp_entry_t *entry)
{
@@ -4445,6 +4461,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
+
+ /*
+ * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
+ * a device and because they are not accessible by CPU they are store
+ * as special swap entry in the CPU page table.
+ */
+ if (is_device_private_entry(ent)) {
+ page = device_private_entry_to_page(ent);
+ /*
+ * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
+ * a refcount of 1 when free (unlike normal page)
+ */
+ if (!page_ref_add_unless(page, 1, 1))
+ return NULL;
+ return page;
+ }
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -4605,6 +4638,13 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
+ * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * For now we such page is charge like a regular page would be as for all
+ * intent and purposes it is just special memory taking the place of a
+ * regular page.
+ *
+ * See Documentations/vm/hmm.txt and include/linux/hmm.h
*
* Called with pte lock held.
*/
@@ -4633,6 +4673,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+ if (is_device_private_page(page) ||
+ is_device_public_page(page))
+ ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
@@ -4664,6 +4707,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
struct page *page = NULL;
enum mc_target_type ret = MC_TARGET_NONE;
+ if (unlikely(is_swap_pmd(pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(pmd));
+ return ret;
+ }
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
@@ -4695,6 +4743,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
+ /*
+ * Note their can not be MC_TARGET_DEVICE for now as we do not
+ * support transparent huge page with MEMORY_DEVICE_PUBLIC or
+ * MEMORY_DEVICE_PRIVATE but this might change.
+ */
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4910,6 +4963,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
putback_lru_page(page);
}
put_page(page);
+ } else if (target_type == MC_TARGET_DEVICE) {
+ page = target.page;
+ if (!mem_cgroup_move_account(page, true,
+ mc.from, mc.to)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ put_page(page);
}
spin_unlock(ptl);
return 0;
@@ -4921,12 +4982,16 @@ retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
+ bool device = false;
swp_entry_t ent;
if (!mc.precharge)
break;
switch (get_mctgt_type(vma, addr, ptent, &target)) {
+ case MC_TARGET_DEVICE:
+ device = true;
+ /* fall through */
case MC_TARGET_PAGE:
page = target.page;
/*
@@ -4937,7 +5002,7 @@ retry:
*/
if (PageTransCompound(page))
goto put;
- if (isolate_lru_page(page))
+ if (!device && isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
@@ -4945,7 +5010,8 @@ retry:
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
- putback_lru_page(page);
+ if (!device)
+ putback_lru_page(page);
put: /* get_mctgt_type() gets the page */
put_page(page);
break;
@@ -5535,48 +5601,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
}
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
- unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_kmem, unsigned long nr_huge,
- unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+ struct mem_cgroup *memcg;
+ unsigned long pgpgout;
+ unsigned long nr_anon;
+ unsigned long nr_file;
+ unsigned long nr_kmem;
+ unsigned long nr_huge;
+ unsigned long nr_shmem;
+ struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+ memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+ unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
- if (!mem_cgroup_is_root(memcg)) {
- page_counter_uncharge(&memcg->memory, nr_pages);
+ if (!mem_cgroup_is_root(ug->memcg)) {
+ page_counter_uncharge(&ug->memcg->memory, nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
- page_counter_uncharge(&memcg->kmem, nr_kmem);
- memcg_oom_recover(memcg);
+ page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+ page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
- __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
- __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
- __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
- __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
- __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
- memcg_check_events(memcg, dummy_page);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+ __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+ __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+ __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+ memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_pages);
+ if (!mem_cgroup_is_root(ug->memcg))
+ css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+ if (!page->mem_cgroup)
+ return;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * page->mem_cgroup at this point, we have fully
+ * exclusive access to the page.
+ */
+
+ if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg) {
+ uncharge_batch(ug);
+ uncharge_gather_clear(ug);
+ }
+ ug->memcg = page->mem_cgroup;
+ }
+
+ if (!PageKmemcg(page)) {
+ unsigned int nr_pages = 1;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ ug->nr_huge += nr_pages;
+ }
+ if (PageAnon(page))
+ ug->nr_anon += nr_pages;
+ else {
+ ug->nr_file += nr_pages;
+ if (PageSwapBacked(page))
+ ug->nr_shmem += nr_pages;
+ }
+ ug->pgpgout++;
+ } else {
+ ug->nr_kmem += 1 << compound_order(page);
+ __ClearPageKmemcg(page);
+ }
+
+ ug->dummy_page = page;
+ page->mem_cgroup = NULL;
}
static void uncharge_list(struct list_head *page_list)
{
- struct mem_cgroup *memcg = NULL;
- unsigned long nr_shmem = 0;
- unsigned long nr_anon = 0;
- unsigned long nr_file = 0;
- unsigned long nr_huge = 0;
- unsigned long nr_kmem = 0;
- unsigned long pgpgout = 0;
+ struct uncharge_gather ug;
struct list_head *next;
- struct page *page;
+
+ uncharge_gather_clear(&ug);
/*
* Note that the list can be a single page->lru; hence the
@@ -5584,57 +5704,16 @@ static void uncharge_list(struct list_head *page_list)
*/
next = page_list->next;
do {
+ struct page *page;
+
page = list_entry(next, struct page, lru);
next = page->lru.next;
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
- if (!page->mem_cgroup)
- continue;
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
- * exclusive access to the page.
- */
-
- if (memcg != page->mem_cgroup) {
- if (memcg) {
- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_kmem, nr_huge, nr_shmem, page);
- pgpgout = nr_anon = nr_file = nr_kmem = 0;
- nr_huge = nr_shmem = 0;
- }
- memcg = page->mem_cgroup;
- }
-
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
-
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- nr_huge += nr_pages;
- }
- if (PageAnon(page))
- nr_anon += nr_pages;
- else {
- nr_file += nr_pages;
- if (PageSwapBacked(page))
- nr_shmem += nr_pages;
- }
- pgpgout++;
- } else {
- nr_kmem += 1 << compound_order(page);
- __ClearPageKmemcg(page);
- }
-
- page->mem_cgroup = NULL;
+ uncharge_page(page, &ug);
} while (next != page_list);
- if (memcg)
- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_kmem, nr_huge, nr_shmem, page);
+ if (ug.memcg)
+ uncharge_batch(&ug);
}
/**
@@ -5646,6 +5725,8 @@ static void uncharge_list(struct list_head *page_list)
*/
void mem_cgroup_uncharge(struct page *page)
{
+ struct uncharge_gather ug;
+
if (mem_cgroup_disabled())
return;
@@ -5653,8 +5734,9 @@ void mem_cgroup_uncharge(struct page *page)
if (!page->mem_cgroup)
return;
- INIT_LIST_HEAD(&page->lru);
- uncharge_list(&page->lru);
+ uncharge_gather_clear(&ug);
+ uncharge_page(page, &ug);
+ uncharge_batch(&ug);
}
/**
@@ -5819,8 +5901,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
- page_counter_uncharge(&memcg->memory, nr_pages);
- css_put_many(&memcg->css, nr_pages);
+ refill_stock(memcg, nr_pages);
}
static int __init cgroup_memory(char *s)
@@ -5876,6 +5957,7 @@ static int __init mem_cgroup_init(void)
node_online(node) ? node : NUMA_NO_NODE);
rtpn->rb_root = RB_ROOT;
+ rtpn->rb_rightmost = NULL;
spin_lock_init(&rtpn->lock);
soft_limit_tree.rb_tree_per_node[node] = rtpn;
}
diff --git a/mm/memory.c b/mm/memory.c
index 13ee83b43878..ec4e15494901 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
#else
# define HAVE_PTE_SPECIAL 0
#endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
@@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- if (!is_zero_pfn(pfn))
- print_bad_pte(vma, addr, pte, NULL);
+ if (is_zero_pfn(pfn))
+ return NULL;
+
+ /*
+ * Device public pages are special pages (they are ZONE_DEVICE
+ * pages but different from persistent memory). They behave
+ * allmost like normal pages. The difference is that they are
+ * not on the lru and thus should never be involve with any-
+ * thing that involve lru manipulation (mlock, numa balancing,
+ * ...).
+ *
+ * This is why we still want to return NULL for such page from
+ * vm_normal_page() so that we do not have to special case all
+ * call site of vm_normal_page().
+ */
+ if (likely(pfn < highest_memmap_pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (is_device_public_page(page)) {
+ if (with_public_device)
+ return page;
+ return NULL;
+ }
+ }
+ print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
+
+ /*
+ * Update rss count even for unaddressable pages, as
+ * they should treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
}
goto out_set_pte;
}
@@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
+ } else if (pte_devmap(pte)) {
+ page = pte_page(pte);
+
+ /*
+ * Cache coherent device memory behave like regular page and
+ * not like persistent memory page. For more informations see
+ * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
+ */
+ if (is_device_public_page(page)) {
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
+ }
}
out_set_pte:
@@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
+ || pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
err = copy_huge_pmd(dst_mm, src_mm,
@@ -1236,7 +1303,7 @@ again:
if (pte_present(ptent)) {
struct page *page;
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -1273,6 +1340,29 @@ again:
}
continue;
}
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ struct page *page = device_private_entry_to_page(entry);
+
+ if (unlikely(details && details->check_mapping)) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping !=
+ page_rmapping(page))
+ continue;
+ }
+
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ put_page(page);
+ continue;
+ }
+
/* If details->check_mapping, we leave swap entries. */
if (unlikely(details))
continue;
@@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
@@ -2671,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
-static inline void unmap_mapping_range_tree(struct rb_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
@@ -2735,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
- if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
@@ -2775,6 +2865,14 @@ int do_swap_page(struct vm_fault *vmf)
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_private_entry(entry)) {
+ /*
+ * For un-addressable device memory we call the pgmap
+ * fault handler callback. The callback must migrate
+ * the page back to some CPU accessible page.
+ */
+ ret = device_private_entry_fault(vma, vmf->address, entry,
+ vmf->flags, vmf->pmd);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
@@ -3863,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
@@ -3885,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
/* NUMA case for anonymous PUDs would go here */
@@ -3911,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pmd_t orig_pmd = *vmf.pmd;
barrier();
+ if (unlikely(is_swap_pmd(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ if (is_pmd_migration_entry(orig_pmd))
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((vmf.flags & FAULT_FLAG_WRITE) &&
- !pmd_write(orig_pmd)) {
+ if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -3949,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
@@ -3956,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
- if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
- flags & FAULT_FLAG_INSTRUCTION,
- flags & FAULT_FLAG_REMOTE))
- return VM_FAULT_SIGSEGV;
-
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 73bf17df6899..e882cb6da994 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -99,7 +99,7 @@ void mem_hotplug_done(void)
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
- struct resource *res;
+ struct resource *res, *conflict;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (!res)
return ERR_PTR(-ENOMEM);
@@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->start = start;
res->end = start + size - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res) < 0) {
+ conflict = request_resource_conflict(&iomem_resource, res);
+ if (conflict) {
+ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_debug("Device unaddressable memory block "
+ "memory hotplug at %#010llx !\n",
+ (unsigned long long)start);
+ }
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
return ERR_PTR(-EEXIST);
@@ -1380,7 +1386,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (isolate_huge_page(page, &source))
move_pages -= 1 << compound_order(head);
continue;
- }
+ } else if (thp_migration_supported() && PageTransHuge(page))
+ pfn = page_to_pfn(compound_head(page))
+ + hpage_nr_pages(page) - 1;
if (!get_page_unless_zero(page))
continue;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 618ab125228b..006ba625c0b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,6 +97,7 @@
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
+#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>
@@ -412,6 +413,64 @@ struct queue_pages {
};
/*
+ * Check if the page's nid is in qp->nmask.
+ *
+ * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
+ * in the invert of qp->nmask.
+ */
+static inline bool queue_pages_required(struct page *page,
+ struct queue_pages *qp)
+{
+ int nid = page_to_nid(page);
+ unsigned long flags = qp->flags;
+
+ return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
+}
+
+static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ int ret = 0;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags;
+
+ if (unlikely(is_pmd_migration_entry(*pmd))) {
+ ret = 1;
+ goto unlock;
+ }
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
+ goto out;
+ }
+ if (!thp_migration_supported()) {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
+ if (!queue_pages_required(page, qp)) {
+ ret = 1;
+ goto unlock;
+ }
+
+ ret = 1;
+ flags = qp->flags;
+ /* go to thp migration */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, qp->pagelist, flags);
+unlock:
+ spin_unlock(ptl);
+out:
+ return ret;
+}
+
+/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*/
@@ -422,30 +481,15 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid, ret;
+ int ret;
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge(*pmd)) {
- ptl = pmd_lock(walk->mm, pmd);
- if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- __split_huge_pmd(vma, pmd, addr, false, NULL);
- } else {
- get_page(page);
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return 0;
- }
- } else {
- spin_unlock(ptl);
- }
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
+ if (ret)
+ return 0;
}
if (pmd_trans_unstable(pmd))
@@ -464,10 +508,9 @@ retry:
*/
if (PageReserved(page))
continue;
- nid = page_to_nid(page);
- if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
+ if (!queue_pages_required(page, qp))
continue;
- if (PageTransCompound(page)) {
+ if (PageTransCompound(page) && !thp_migration_supported()) {
get_page(page);
pte_unmap_unlock(pte, ptl);
lock_page(page);
@@ -497,7 +540,6 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
struct page *page;
spinlock_t *ptl;
pte_t entry;
@@ -507,8 +549,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
- nid = page_to_nid(page);
- if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
+ if (!queue_pages_required(page, qp))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
@@ -881,19 +922,21 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
#ifdef CONFIG_MIGRATION
/*
- * page migration
+ * page migration, thp tail pages can be passed.
*/
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
+ struct page *head = compound_head(page);
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
- if (!isolate_lru_page(page)) {
- list_add_tail(&page->lru, pagelist);
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
+ if (!isolate_lru_page(head)) {
+ list_add_tail(&head->lru, pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
}
}
}
@@ -903,7 +946,17 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
- else
+ else if (thp_migration_supported() && PageTransHuge(page)) {
+ struct page *thp;
+
+ thp = alloc_pages_node(node,
+ (GFP_TRANSHUGE | __GFP_THISNODE),
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ } else
return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE, 0);
}
@@ -1069,6 +1122,15 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
if (PageHuge(page)) {
BUG_ON(!vma);
return alloc_huge_page_noerr(vma, address, 1);
+ } else if (thp_migration_supported() && PageTransHuge(page)) {
+ struct page *thp;
+
+ thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
@@ -1683,8 +1745,7 @@ unsigned int mempolicy_slab_node(void)
* node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
-static unsigned offset_il_node(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long n)
+static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
@@ -1717,7 +1778,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
BUG_ON(shift < PAGE_SHIFT);
off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
- return offset_il_node(pol, vma, off);
+ return offset_il_node(pol, off);
} else
return interleave_nodes(pol);
}
@@ -2172,20 +2233,15 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int polnid = -1;
int ret = -1;
- BUG_ON(!vma);
-
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
- BUG_ON(addr >= vma->vm_end);
- BUG_ON(addr < vma->vm_start);
-
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
- polnid = offset_il_node(pol, vma, pgoff);
+ polnid = offset_il_node(pol, pgoff);
break;
case MPOL_PREFERRED:
diff --git a/mm/migrate.c b/mm/migrate.c
index e84eeb4e4356..6954c1435833 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,9 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -185,8 +188,8 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -216,6 +219,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
new = page - pvmw.page->index +
linear_page_index(vma, pvmw.address);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+ remove_migration_pmd(&pvmw, new);
+ continue;
+ }
+#endif
+
get_page(new);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(*pvmw.pte))
@@ -228,7 +240,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- flush_dcache_page(new);
+ if (unlikely(is_zone_device_page(new))) {
+ if (is_device_private_page(new)) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ } else if (is_device_public_page(new)) {
+ pte = pte_mkdevmap(pte);
+ flush_dcache_page(new);
+ }
+ } else
+ flush_dcache_page(new);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -330,6 +352,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
__migration_entry_wait(mm, pte, ptl);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
+{
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!is_pmd_migration_entry(*pmd))
+ goto unlock;
+ page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+ if (!get_page_unless_zero(page))
+ goto unlock;
+ spin_unlock(ptl);
+ wait_on_page_locked(page);
+ put_page(page);
+ return;
+unlock:
+ spin_unlock(ptl);
+}
+#endif
+
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
@@ -398,6 +441,13 @@ int migrate_page_move_mapping(struct address_space *mapping,
int expected_count = 1 + extra_count;
void **pslot;
+ /*
+ * Device public or private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ expected_count += is_device_public_page(page);
+
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
@@ -604,15 +654,10 @@ static void copy_huge_page(struct page *dst, struct page *src)
/*
* Copy the page to its new location
*/
-void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_states(struct page *newpage, struct page *page)
{
int cpupid;
- if (PageHuge(page) || PageTransHuge(page))
- copy_huge_page(newpage, page);
- else
- copy_highpage(newpage, page);
-
if (PageError(page))
SetPageError(newpage);
if (PageReferenced(page))
@@ -666,6 +711,17 @@ void migrate_page_copy(struct page *newpage, struct page *page)
mem_cgroup_migrate(page, newpage);
}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ if (PageHuge(page) || PageTransHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
+
+ migrate_page_states(newpage, page);
+}
EXPORT_SYMBOL(migrate_page_copy);
/************************************************************
@@ -691,7 +747,10 @@ int migrate_page(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -741,12 +800,15 @@ int buffer_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
+ put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -805,8 +867,13 @@ static int fallback_migrate_page(struct address_space *mapping,
{
if (PageDirty(page)) {
/* Only writeback pages in full synchronous migration */
- if (mode != MIGRATE_SYNC)
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
return -EBUSY;
+ }
return writeout(mapping, page);
}
@@ -943,7 +1010,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
*/
- if (mode != MIGRATE_SYNC) {
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
rc = -EBUSY;
goto out_unlock;
}
@@ -1088,7 +1159,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page))) {
+ if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
lock_page(page);
rc = split_huge_page(page);
unlock_page(page);
@@ -1116,8 +1187,8 @@ out:
* as __PageMovable
*/
if (likely(!__PageMovable(page)))
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page), -hpage_nr_pages(page));
}
/*
@@ -1213,8 +1284,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOMEM;
if (!trylock_page(hpage)) {
- if (!force || mode != MIGRATE_SYNC)
+ if (!force)
+ goto out;
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
goto out;
+ }
lock_page(hpage);
}
@@ -1391,7 +1469,17 @@ static struct page *new_page_node(struct page *p, unsigned long private,
if (PageHuge(p))
return alloc_huge_page_node(page_hstate(compound_head(p)),
pm->node);
- else
+ else if (thp_migration_supported() && PageTransHuge(p)) {
+ struct page *thp;
+
+ thp = alloc_pages_node(pm->node,
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ } else
return __alloc_pages_node(pm->node,
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
@@ -1418,6 +1506,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
struct vm_area_struct *vma;
struct page *page;
+ struct page *head;
+ unsigned int follflags;
err = -EFAULT;
vma = find_vma(mm, pp->addr);
@@ -1425,8 +1515,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, pp->addr,
- FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
+ follflags = FOLL_GET | FOLL_DUMP;
+ if (!thp_migration_supported())
+ follflags |= FOLL_SPLIT;
+ page = follow_page(vma, pp->addr, follflags);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1436,7 +1528,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!page)
goto set_status;
- pp->page = page;
err = page_to_nid(page);
if (err == pp->node)
@@ -1451,16 +1542,22 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
if (PageHuge(page)) {
- if (PageHead(page))
+ if (PageHead(page)) {
isolate_huge_page(page, &pagelist);
+ err = 0;
+ pp->page = page;
+ }
goto put_and_set;
}
- err = isolate_lru_page(page);
+ pp->page = compound_head(page);
+ head = compound_head(page);
+ err = isolate_lru_page(head);
if (!err) {
- list_add_tail(&page->lru, &pagelist);
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ list_add_tail(&head->lru, &pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
}
put_and_set:
/*
@@ -2029,3 +2126,859 @@ out_unlock:
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */
+
+#if defined(CONFIG_MIGRATE_VMA_HELPER)
+struct migrate_vma {
+ struct vm_area_struct *vma;
+ unsigned long *dst;
+ unsigned long *src;
+ unsigned long cpages;
+ unsigned long npages;
+ unsigned long start;
+ unsigned long end;
+};
+
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+ pfn = pte_pfn(pte);
+
+ if (pte_none(pte)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ mpfn = pfn = 0;
+
+ /*
+ * Only care about unaddressable device page special
+ * page table entry. Other special swap entries are not
+ * migratable, and we ignore regular swapped page.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = device_private_entry_to_page(entry);
+ mpfn = migrate_pfn(page_to_pfn(page))|
+ MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+ page = _vm_normal_page(migrate->vma, addr, pte, true);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = pfn = 0;
+ goto next;
+ }
+ pfn = page_to_pfn(page);
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non device page (device page are not on the lru and thus
+ * can't be dropped from it).
+ */
+ get_page(page);
+ migrate->cpages++;
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ mpfn |= MIGRATE_PFN_LOCKED;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* Setup special migration page table entry */
+ entry = make_migration_entry(page, pte_write(pte));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return 0;
+}
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ struct mm_walk mm_walk;
+
+ mm_walk.pmd_entry = migrate_vma_collect_pmd;
+ mm_walk.pte_entry = NULL;
+ mm_walk.pte_hole = migrate_vma_collect_hole;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.vma = migrate->vma;
+ mm_walk.mm = migrate->vma->vm_mm;
+ mm_walk.private = migrate;
+
+ mmu_notifier_invalidate_range_start(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+ walk_page_range(migrate->start, migrate->end, &mm_walk);
+ mmu_notifier_invalidate_range_end(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * migrate_page_move_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1;
+
+ /*
+ * FIXME support THP (transparent huge page), it is bit more complex to
+ * check them than regular pages, because they can be mapped with a pmd
+ * or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Page from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page)) {
+ /*
+ * Private page can never be pin as they have no valid pte and
+ * GUP will fail for those. Yet if there is a pending migration
+ * a thread might try to wait on the pte migration entry and
+ * will bump the page reference count. Sadly there is no way to
+ * differentiate a regular pin from migration wait. Hence to
+ * avoid 2 racing thread trying to migrate back to CPU to enter
+ * infinite loop (one stoping migration because the other is
+ * waiting on pte migration entry). We always return true here.
+ *
+ * FIXME proper solution is to rework migration_entry_wait() so
+ * it does not need to take a reference on page.
+ */
+ if (is_device_private_page(page))
+ return true;
+
+ /*
+ * Only allow device public page to be migrated and account for
+ * the extra reference count imply by ZONE_DEVICE pages.
+ */
+ if (!is_device_public_page(page))
+ return false;
+ extra++;
+ }
+
+ /* For file back page */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+}
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; (i < npages) && migrate->cpages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ bool remap = true;
+
+ if (!page)
+ continue;
+
+ if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
+ /*
+ * Because we are migrating several pages there can be
+ * a deadlock between 2 concurrent migration where each
+ * are waiting on each other page lock.
+ *
+ * Make migrate_vma() a best effort thing and backoff
+ * for any page we can not lock right away.
+ */
+ if (!trylock_page(page)) {
+ migrate->src[i] = 0;
+ migrate->cpages--;
+ put_page(page);
+ continue;
+ }
+ remap = false;
+ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+ put_page(page);
+ }
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ if (!migrate_vma_check_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (!is_zone_device_page(page))
+ putback_lru_page(page);
+ else
+ put_page(page);
+ }
+ }
+ }
+
+ for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_pte(page, migrate->vma, addr, page);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ put_page(page);
+ restore--;
+ }
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (page_mapped(page)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page))
+ goto restore;
+ }
+
+ if (migrate_vma_check_page(page))
+ continue;
+
+restore:
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ }
+
+ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_ptes(page, page, false);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ restore--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+ }
+}
+
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ unsigned long *dst)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have down_read(mmap_sem).
+ */
+ if (pte_alloc(mm, pmdp, addr))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ entry = swp_entry_to_pte(swp_entry);
+ } else if (is_device_public_page(page)) {
+ entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkdevmap(entry);
+ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ if (!is_zero_pfn(pfn)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+ flush = true;
+ } else if (!pte_none(*ptep)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ /*
+ * Check for usefaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_active_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+/*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+static void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr, i, mmu_start;
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ continue;
+ }
+ if (!notified) {
+ mmu_start = addr;
+ notified = true;
+ mmu_notifier_invalidate_range_start(mm,
+ mmu_start,
+ migrate->end);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &migrate->src[i],
+ &migrate->dst[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_zone_device_page(newpage)) {
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now only support private anonymous when
+ * migrating to un-addressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else if (!is_device_public_page(newpage)) {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ }
+
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ if (notified)
+ mmu_notifier_invalidate_range_end(mm, mmu_start,
+ migrate->end);
+}
+
+/*
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+static void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page;
+ }
+
+ remove_migration_ptes(page, newpage, false);
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+}
+
+/*
+ * migrate_vma() - migrate a range of memory inside vma
+ *
+ * @ops: migration callback for allocating destination memory and copying
+ * @vma: virtual memory area containing the range to be migrated
+ * @start: start address of the range to migrate (inclusive)
+ * @end: end address of the range to migrate (exclusive)
+ * @src: array of hmm_pfn_t containing source pfns
+ * @dst: array of hmm_pfn_t containing destination pfns
+ * @private: pointer passed back to each of the callback
+ * Returns: 0 on success, error code otherwise
+ *
+ * This function tries to migrate a range of memory virtual address range, using
+ * callbacks to allocate and copy memory from source to destination. First it
+ * collects all the pages backing each virtual address in the range, saving this
+ * inside the src array. Then it locks those pages and unmaps them. Once the pages
+ * are locked and unmapped, it checks whether each page is pinned or not. Pages
+ * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
+ * in the corresponding src array entry. It then restores any pages that are
+ * pinned, by remapping and unlocking those pages.
+ *
+ * At this point it calls the alloc_and_copy() callback. For documentation on
+ * what is expected from that callback, see struct migrate_vma_ops comments in
+ * include/linux/migrate.h
+ *
+ * After the alloc_and_copy() callback, this function goes over each entry in
+ * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then the function tries to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the struct
+ * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
+ * array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * It then calls the finalize_and_map() callback. See comments for "struct
+ * migrate_vma_ops", in include/linux/migrate.h for details about
+ * finalize_and_map() behavior.
+ *
+ * After the finalize_and_map() callback, for successfully migrated pages, this
+ * function updates the CPU page table to point to new pages, otherwise it
+ * restores the CPU page table to point to the original source pages.
+ *
+ * Function returns 0 after the above steps, even if no pages were migrated
+ * (The function only returns an error if any of the arguments are invalid.)
+ *
+ * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
+ * unsigned long entries.
+ */
+int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private)
+{
+ struct migrate_vma migrate;
+
+ /* Sanity check the arguments */
+ start &= PAGE_MASK;
+ end &= PAGE_MASK;
+ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+ return -EINVAL;
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end <= vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+ if (!ops || !src || !dst || start >= end)
+ return -EINVAL;
+
+ memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
+ migrate.src = src;
+ migrate.dst = dst;
+ migrate.start = start;
+ migrate.npages = 0;
+ migrate.cpages = 0;
+ migrate.end = end;
+ migrate.vma = vma;
+
+ /* Collect, and try to unmap source pages */
+ migrate_vma_collect(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /* Lock and isolate page */
+ migrate_vma_prepare(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /* Unmap pages */
+ migrate_vma_unmap(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the callback.
+ *
+ * Note that migration can fail in migrate_vma_struct_page() for each
+ * individual page.
+ */
+ ops->alloc_and_copy(vma, src, dst, start, end, private);
+
+ /* This does the real migration of struct page */
+ migrate_vma_pages(&migrate);
+
+ ops->finalize_and_map(vma, src, dst, start, end, private);
+
+ /* Unlock and remap pages */
+ migrate_vma_finalize(&migrate);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_vma);
+#endif /* defined(MIGRATE_VMA_HELPER) */
diff --git a/mm/mlock.c b/mm/mlock.c
index b562b5523a65..dfc6f1912176 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -365,8 +365,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* @start + PAGE_SIZE when no page could be added by the pte walk.
*/
static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
- struct vm_area_struct *vma, int zoneid, unsigned long start,
- unsigned long end)
+ struct vm_area_struct *vma, struct zone *zone,
+ unsigned long start, unsigned long end)
{
pte_t *pte;
spinlock_t *ptl;
@@ -394,7 +394,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
* Break if page could not be obtained or the page's node+zone does not
* match
*/
- if (!page || page_zone_id(page) != zoneid)
+ if (!page || page_zone(page) != zone)
break;
/*
@@ -446,7 +446,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long page_increm;
struct pagevec pvec;
struct zone *zone;
- int zoneid;
pagevec_init(&pvec, 0);
/*
@@ -481,7 +480,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
*/
pagevec_add(&pvec, page);
zone = page_zone(page);
- zoneid = page_zone_id(page);
/*
* Try to fill the rest of pagevec using fast
@@ -490,7 +488,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* pagevec.
*/
start = __munlock_pagevec_fill(&pvec, vma,
- zoneid, start, end);
+ zone, start, end);
__munlock_pagevec(&pvec, zone);
goto next;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 4c5981651407..680506faceae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -685,7 +685,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
struct address_space *mapping = NULL;
- struct rb_root *root = NULL;
+ struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
bool start_changed = false, end_changed = false;
@@ -3340,7 +3340,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
@@ -3356,7 +3356,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->root->rb_root.rb_node))
+ &anon_vma->root->rb_root.rb_root.rb_node))
BUG();
}
}
@@ -3458,7 +3458,7 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
@@ -3472,7 +3472,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
* anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->root->rb_root.rb_node))
+ &anon_vma->root->rb_root.rb_root.rb_node))
BUG();
anon_vma_unlock_write(anon_vma);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bd0f409922cb..6d3e2f082290 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -125,6 +125,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pages++;
}
+
+ if (is_write_device_private_entry(entry)) {
+ pte_t newpte;
+
+ /*
+ * We do not preserve soft-dirtiness. See
+ * copy_one_pte() for explanation.
+ */
+ make_device_private_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ set_pte_at(mm, addr, pte, newpte);
+
+ pages++;
+ }
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -149,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
+ if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
&& pmd_none_or_clear_bad(pmd))
continue;
@@ -159,7 +173,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mni_start, end);
}
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
diff --git a/mm/mremap.c b/mm/mremap.c
index 7395564daa6c..cfec004c4ff9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -223,7 +223,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (pmd_trans_huge(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
/* See comment in move_ptes() */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a9add06fe768..c841af88836a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order)
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- enum zone_stat_item local_stat = NUMA_LOCAL;
+ enum numa_stat_item local_stat = NUMA_LOCAL;
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
if (z->node == preferred_zone->node)
- __inc_zone_state(z, NUMA_HIT);
+ __inc_numa_state(z, NUMA_HIT);
else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+ __inc_numa_state(z, NUMA_MISS);
+ __inc_numa_state(preferred_zone, NUMA_FOREIGN);
}
- __inc_zone_state(z, local_stat);
+ __inc_numa_state(z, local_stat);
#endif
}
@@ -4183,10 +4183,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
gfp_mask &= gfp_allowed_mask;
+ alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8ec6ba230bb9..6a03946469a9 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
if (!is_swap_pte(*pvmw->pte))
return false;
entry = pte_to_swp_entry(*pvmw->pte);
+
if (!is_migration_entry(entry))
return false;
if (migration_entry_to_page(entry) - pvmw->page >=
@@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
WARN_ON_ONCE(1);
#endif
} else {
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (is_device_private_entry(entry) &&
+ device_private_entry_to_page(entry) == pvmw->page)
+ return true;
+ }
+
if (!pte_present(*pvmw->pte))
return false;
@@ -138,16 +148,28 @@ restart:
if (!pud_present(*pud))
return false;
pvmw->pmd = pmd_offset(pud, pvmw->address);
- if (pmd_trans_huge(*pvmw->pmd)) {
+ if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
- if (!pmd_present(*pvmw->pmd))
- return not_found(pvmw);
if (likely(pmd_trans_huge(*pvmw->pmd))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
if (pmd_page(*pvmw->pmd) != page)
return not_found(pvmw);
return true;
+ } else if (!pmd_present(*pvmw->pmd)) {
+ if (thp_migration_supported()) {
+ if (!(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
+
+ if (migration_entry_to_page(entry) != page)
+ return not_found(pvmw);
+ return true;
+ }
+ } else
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+ return not_found(pvmw);
} else {
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c99d9512a45b..1175f6a24fdb 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/rmap.c b/mm/rmap.c
index c570f82e6827..b874c4761e84 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -63,6 +63,7 @@
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
+#include <linux/memremap.h>
#include <asm/tlbflush.h>
@@ -390,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
anon_vma->parent->degree--;
continue;
}
@@ -424,7 +425,7 @@ static void anon_vma_ctor(void *data)
init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
- anon_vma->rb_root = RB_ROOT;
+ anon_vma->rb_root = RB_ROOT_CACHED;
}
void __init anon_vma_init(void)
@@ -1346,9 +1347,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
- flags & TTU_MIGRATION, page);
+ flags & TTU_SPLIT_FREEZE, page);
}
/*
@@ -1360,6 +1365,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte && (flags & TTU_MIGRATION)) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+
+ if (!PageAnon(page))
+ continue;
+
+ set_pmd_migration_entry(&pvmw, page);
+ continue;
+ }
+#endif
+
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1390,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address = pvmw.address;
+ if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, 0);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ goto discard;
+ }
+
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
@@ -1445,7 +1484,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
dec_mm_counter(mm, mm_counter(page));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION)) {
+ (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
swp_entry_t entry;
pte_t swp_pte;
/*
@@ -1575,7 +1614,8 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
+ if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
+ && !PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
diff --git a/mm/slub.c b/mm/slub.c
index ddb04576b342..d39a5d3834b3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4232,7 +4232,7 @@ void __init kmem_cache_init(void)
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
- pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
diff --git a/mm/sparse.c b/mm/sparse.c
index a9783acf2bb9..83b3bf6461af 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -626,7 +626,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(pfn);
struct mem_section *ms;
/* onlining code should never touch invalid ranges */
diff --git a/mm/swap.c b/mm/swap.c
index 62d96b8e5eb3..9295ae960d66 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold)
if (is_huge_zero_page(page))
continue;
+ /* Device public page can not be huge page */
+ if (is_device_public_page(page)) {
+ if (locked_pgdat) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ flags);
+ locked_pgdat = NULL;
+ }
+ put_zone_device_private_or_public_page(page);
+ continue;
+ }
+
page = compound_head(page);
if (!put_page_testzero(page))
continue;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d483278ee35b..bf91dc9e7a79 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3290,7 +3290,8 @@ bad_swap:
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
- vfree(cluster_info);
+ kvfree(cluster_info);
+ kvfree(frontswap_map);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
inode_unlock(inode);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7e4b8458023..4bb13e72ac97 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -30,6 +30,8 @@
#include "internal.h"
+#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
+
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -87,8 +89,10 @@ void vm_events_fold_cpu(int cpu)
* vm_stat contains the global counters
*/
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);
#ifdef CONFIG_SMP
@@ -604,6 +608,32 @@ EXPORT_SYMBOL(dec_node_page_state);
* Fold a differential into the global counters.
* Returns the number of counters updated.
*/
+#ifdef CONFIG_NUMA
+static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+{
+ int i;
+ int changes = 0;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (zone_diff[i]) {
+ atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+ changes++;
+ }
+
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ if (numa_diff[i]) {
+ atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
+ changes++;
+ }
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (node_diff[i]) {
+ atomic_long_add(node_diff[i], &vm_node_stat[i]);
+ changes++;
+ }
+ return changes;
+}
+#else
static int fold_diff(int *zone_diff, int *node_diff)
{
int i;
@@ -622,6 +652,7 @@ static int fold_diff(int *zone_diff, int *node_diff)
}
return changes;
}
+#endif /* CONFIG_NUMA */
/*
* Update the zone counters for the current cpu.
@@ -645,6 +676,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+ int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0;
@@ -666,6 +700,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
}
#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+ int v;
+
+ v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+ if (v) {
+
+ atomic_long_add(v, &zone->vm_numa_stat[i]);
+ global_numa_diff[i] += v;
+ __this_cpu_write(p->expire, 3);
+ }
+ }
+
if (do_pagesets) {
cond_resched();
/*
@@ -712,7 +758,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
}
+#ifdef CONFIG_NUMA
+ changes += fold_diff(global_zone_diff, global_numa_diff,
+ global_node_diff);
+#else
changes += fold_diff(global_zone_diff, global_node_diff);
+#endif
return changes;
}
@@ -727,6 +778,9 @@ void cpu_vm_stats_fold(int cpu)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+ int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
for_each_populated_zone(zone) {
@@ -743,6 +797,18 @@ void cpu_vm_stats_fold(int cpu)
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
}
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ if (p->vm_numa_stat_diff[i]) {
+ int v;
+
+ v = p->vm_numa_stat_diff[i];
+ p->vm_numa_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_numa_stat[i]);
+ global_numa_diff[i] += v;
+ }
+#endif
}
for_each_online_pgdat(pgdat) {
@@ -761,7 +827,11 @@ void cpu_vm_stats_fold(int cpu)
}
}
+#ifdef CONFIG_NUMA
+ fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
+#else
fold_diff(global_zone_diff, global_node_diff);
+#endif
}
/*
@@ -779,10 +849,36 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
atomic_long_add(v, &zone->vm_stat[i]);
atomic_long_add(v, &vm_zone_stat[i]);
}
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ if (pset->vm_numa_stat_diff[i]) {
+ int v = pset->vm_numa_stat_diff[i];
+
+ pset->vm_numa_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_numa_stat[i]);
+ atomic_long_add(v, &vm_numa_stat[i]);
+ }
+#endif
}
#endif
#ifdef CONFIG_NUMA
+void __inc_numa_state(struct zone *zone,
+ enum numa_stat_item item)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ u16 __percpu *p = pcp->vm_numa_stat_diff + item;
+ u16 v;
+
+ v = __this_cpu_inc_return(*p);
+
+ if (unlikely(v > NUMA_STATS_THRESHOLD)) {
+ zone_numa_state_add(v, zone, item);
+ __this_cpu_write(*p, 0);
+ }
+}
+
/*
* Determine the per node value of a stat item. This function
* is called frequently in a NUMA machine, so try to be as
@@ -802,6 +898,23 @@ unsigned long sum_zone_node_page_state(int node,
}
/*
+ * Determine the per node value of a numa stat item. To avoid deviation,
+ * the per cpu stat number in vm_numa_stat_diff[] is also included.
+ */
+unsigned long sum_zone_numa_state(int node,
+ enum numa_stat_item item)
+{
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_numa_state_snapshot(zones + i, item);
+
+ return count;
+}
+
+/*
* Determine the per node value of a stat item.
*/
unsigned long node_page_state(struct pglist_data *pgdat,
@@ -937,6 +1050,9 @@ const char * const vmstat_text[] = {
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
#endif
+ "nr_free_cma",
+
+ /* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",
@@ -945,7 +1061,6 @@ const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
- "nr_free_cma",
/* Node-based counters */
"nr_inactive_anon",
@@ -1106,7 +1221,6 @@ const char * const vmstat_text[] = {
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
-
#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
defined(CONFIG_PROC_FS)
static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -1384,7 +1498,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu",
- vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+ vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NUMA_STAT_ITEMS],
node_page_state(pgdat, i));
}
}
@@ -1421,6 +1536,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i));
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu",
+ vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+ zone_numa_state_snapshot(zone, i));
+#endif
+
seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
@@ -1497,6 +1619,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
@@ -1512,6 +1635,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v[i] = global_zone_page_state(i);
v += NR_VM_ZONE_STAT_ITEMS;
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ v[i] = global_numa_state(i);
+ v += NR_VM_NUMA_STAT_ITEMS;
+#endif
+
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
v[i] = global_node_page_state(i);
v += NR_VM_NODE_STAT_ITEMS;
@@ -1613,6 +1742,16 @@ int vmstat_refresh(struct ctl_table *table, int write,
err = -EINVAL;
}
}
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_numa_stat[i]);
+ if (val < 0) {
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
+ err = -EINVAL;
+ }
+ }
+#endif
if (err)
return err;
if (write)
@@ -1654,13 +1793,20 @@ static bool need_update(int cpu)
struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+#ifdef CONFIG_NUMA
+ BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
+#endif
+
/*
* The fast way of checking if there are any vmstat diffs.
* This works because the diffs are byte sized items.
*/
if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
return true;
-
+#ifdef CONFIG_NUMA
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+ return true;
+#endif
}
return false;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 62457eb82330..7c38e850a8fc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -551,20 +551,23 @@ static int get_size_class_index(int size)
return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
+ int type, unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
+ int type, unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
+ int type)
{
return class->stats.objs[type];
}
@@ -1969,6 +1972,14 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
unsigned int obj_idx;
int ret = -EAGAIN;
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the zs lock, which does not work with
+ * MIGRATE_SYNC_NO_COPY workflow.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);