aboutsummaryrefslogtreecommitdiffstats
path: root/mm/migrate.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/migrate.c')
-rw-r--r--mm/migrate.c1018
1 files changed, 983 insertions, 35 deletions
diff --git a/mm/migrate.c b/mm/migrate.c
index d68a41da6abb..6954c1435833 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,11 +36,15 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
+#include <linux/ptrace.h>
#include <asm/tlbflush.h>
@@ -184,8 +188,8 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -215,6 +219,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
new = page - pvmw.page->index +
linear_page_index(vma, pvmw.address);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+ remove_migration_pmd(&pvmw, new);
+ continue;
+ }
+#endif
+
get_page(new);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(*pvmw.pte))
@@ -227,7 +240,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- flush_dcache_page(new);
+ if (unlikely(is_zone_device_page(new))) {
+ if (is_device_private_page(new)) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ } else if (is_device_public_page(new)) {
+ pte = pte_mkdevmap(pte);
+ flush_dcache_page(new);
+ }
+ } else
+ flush_dcache_page(new);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -329,6 +352,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
__migration_entry_wait(mm, pte, ptl);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/*
+ * Wait for a PMD-level migration to complete. With the pmd lock held, check
+ * whether *pmd is a migration entry and try to take a reference on the page
+ * it designates. If that works, the lock is released and we sleep until the
+ * page gets unlocked; in every other case the lock is simply released.
+ */
+void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
+{
+ spinlock_t *lock = pmd_lock(mm, pmd);
+ struct page *target = NULL;
+
+ if (is_pmd_migration_entry(*pmd)) {
+ target = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+ /* Could not pin the page: there is nothing to wait on */
+ if (!get_page_unless_zero(target))
+ target = NULL;
+ }
+ spin_unlock(lock);
+
+ if (target) {
+ wait_on_page_locked(target);
+ put_page(target);
+ }
+}
+#endif
+
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
@@ -397,6 +441,13 @@ int migrate_page_move_mapping(struct address_space *mapping,
int expected_count = 1 + extra_count;
void **pslot;
+ /*
+ * Device public or private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ expected_count += is_device_public_page(page);
+
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
@@ -603,15 +654,10 @@ static void copy_huge_page(struct page *dst, struct page *src)
/*
* Copy the page to its new location
*/
-void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_states(struct page *newpage, struct page *page)
{
int cpupid;
- if (PageHuge(page) || PageTransHuge(page))
- copy_huge_page(newpage, page);
- else
- copy_highpage(newpage, page);
-
if (PageError(page))
SetPageError(newpage);
if (PageReferenced(page))
@@ -665,6 +711,17 @@ void migrate_page_copy(struct page *newpage, struct page *page)
mem_cgroup_migrate(page, newpage);
}
+EXPORT_SYMBOL(migrate_page_states);
+
+/*
+ * Copy both the contents and the state of @page into @newpage. Pages for
+ * which PageHuge() or PageTransHuge() is true are copied with
+ * copy_huge_page(), all others with copy_highpage(); migrate_page_states()
+ * then carries over the page state.
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ const bool huge = PageHuge(page) || PageTransHuge(page);
+
+ if (!huge)
+ copy_highpage(newpage, page);
+ else
+ copy_huge_page(newpage, page);
+
+ migrate_page_states(newpage, page);
+}
EXPORT_SYMBOL(migrate_page_copy);
/************************************************************
@@ -690,7 +747,10 @@ int migrate_page(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -740,12 +800,15 @@ int buffer_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
+ put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -804,8 +867,13 @@ static int fallback_migrate_page(struct address_space *mapping,
{
if (PageDirty(page)) {
/* Only writeback pages in full synchronous migration */
- if (mode != MIGRATE_SYNC)
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
return -EBUSY;
+ }
return writeout(mapping, page);
}
@@ -942,7 +1010,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
*/
- if (mode != MIGRATE_SYNC) {
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
rc = -EBUSY;
goto out_unlock;
}
@@ -1087,7 +1159,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page))) {
+ if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
lock_page(page);
rc = split_huge_page(page);
unlock_page(page);
@@ -1115,8 +1187,8 @@ out:
* as __PageMovable
*/
if (likely(!__PageMovable(page)))
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page), -hpage_nr_pages(page));
}
/*
@@ -1212,8 +1284,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOMEM;
if (!trylock_page(hpage)) {
- if (!force || mode != MIGRATE_SYNC)
+ if (!force)
+ goto out;
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
goto out;
+ }
lock_page(hpage);
}
@@ -1390,7 +1469,17 @@ static struct page *new_page_node(struct page *p, unsigned long private,
if (PageHuge(p))
return alloc_huge_page_node(page_hstate(compound_head(p)),
pm->node);
- else
+ else if (thp_migration_supported() && PageTransHuge(p)) {
+ struct page *thp;
+
+ thp = alloc_pages_node(pm->node,
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ } else
return __alloc_pages_node(pm->node,
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
@@ -1417,6 +1506,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
struct vm_area_struct *vma;
struct page *page;
+ struct page *head;
+ unsigned int follflags;
err = -EFAULT;
vma = find_vma(mm, pp->addr);
@@ -1424,8 +1515,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, pp->addr,
- FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
+ follflags = FOLL_GET | FOLL_DUMP;
+ if (!thp_migration_supported())
+ follflags |= FOLL_SPLIT;
+ page = follow_page(vma, pp->addr, follflags);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1435,7 +1528,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!page)
goto set_status;
- pp->page = page;
err = page_to_nid(page);
if (err == pp->node)
@@ -1450,16 +1542,22 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
if (PageHuge(page)) {
- if (PageHead(page))
+ if (PageHead(page)) {
isolate_huge_page(page, &pagelist);
+ err = 0;
+ pp->page = page;
+ }
goto put_and_set;
}
- err = isolate_lru_page(page);
+ pp->page = compound_head(page);
+ head = compound_head(page);
+ err = isolate_lru_page(head);
if (!err) {
- list_add_tail(&page->lru, &pagelist);
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ list_add_tail(&head->lru, &pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
}
put_and_set:
/*
@@ -1652,7 +1750,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
const int __user *, nodes,
int __user *, status, int, flags)
{
- const struct cred *cred = current_cred(), *tcred;
struct task_struct *task;
struct mm_struct *mm;
int err;
@@ -1676,14 +1773,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
/*
* Check if this process has the right to modify the specified
- * process. The right exists if the process has administrative
- * capabilities, superuser privileges or the same
- * userid as the target process.
+ * process. Use the regular "ptrace_may_access()" checks.
*/
- tcred = __task_cred(task);
- if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
- !capable(CAP_SYS_NICE)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out;
@@ -2034,3 +2126,859 @@ out_unlock:
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */
+
+#if defined(CONFIG_MIGRATE_VMA_HELPER)
+struct migrate_vma {
+ struct vm_area_struct *vma; /* VMA containing the range being migrated */
+ unsigned long *dst; /* per-page destination entries, indexed like src */
+ unsigned long *src; /* per-page source entries: migrate pfn plus MIGRATE_PFN_* flags */
+ unsigned long cpages; /* number of entries still candidates for migration */
+ unsigned long npages; /* number of src/dst entries filled in by the collect walk */
+ unsigned long start; /* first virtual address of the range */
+ unsigned long end; /* end of the range; recomputed from npages after collection */
+};
+
+/*
+ * Page-walk callback for a range with nothing mapped (also called directly by
+ * migrate_vma_collect_pmd() for an empty pmd). Every page in [start, end) gets
+ * a src entry of MIGRATE_PFN_MIGRATE with no pfn and a zeroed dst entry, and
+ * is counted in cpages.
+ *
+ * Always returns 0.
+ */
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ /*
+ * Write both arrays at the same index before advancing it.
+ * Bumping npages between the two stores would clear dst for
+ * the next page instead of this one, and run one slot past
+ * the end of the array on the last page of the range.
+ */
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+/*
+ * Record every page in [start, end) with empty entries: the src and the dst
+ * slot are both zeroed and npages advances, while cpages is left untouched.
+ *
+ * Always returns 0.
+ */
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *mig = walk->private;
+ unsigned long va = start & PAGE_MASK;
+
+ while (va < end) {
+ const unsigned long slot = mig->npages++;
+
+ mig->src[slot] = 0;
+ mig->dst[slot] = 0;
+ va += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/*
+ * Page-walk pmd callback: fill migrate->src/dst for every page in [start, end).
+ *
+ * An empty pmd is handed to migrate_vma_collect_hole(). A transparent huge pmd
+ * is split first (the whole range is skipped if that cannot be done right
+ * now), a bad pmd is skipped, and otherwise each pte is examined in turn under
+ * the pte lock. Pages that are taken get a reference and, when they can be
+ * locked immediately, have their pte replaced with a migration entry.
+ *
+ * Always returns 0 (directly or through the hole/skip helpers).
+ */
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ /* Re-check under the lock: the pmd may have changed meanwhile */
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page))) {
+ /* Drop the reference taken above before bailing out */
+ put_page(page);
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ }
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+ pfn = pte_pfn(pte);
+
+ if (pte_none(pte)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ mpfn = pfn = 0;
+
+ /*
+ * Only care about unaddressable device page special
+ * page table entry. Other special swap entries are not
+ * migratable, and we ignore regular swapped page.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = device_private_entry_to_page(entry);
+ mpfn = migrate_pfn(page_to_pfn(page))|
+ MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+ page = _vm_normal_page(migrate->vma, addr, pte, true);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = pfn = 0;
+ goto next;
+ }
+ pfn = page_to_pfn(page);
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non device page (device page are not on the lru and thus
+ * can't be dropped from it).
+ */
+ get_page(page);
+ migrate->cpages++;
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ mpfn |= MIGRATE_PFN_LOCKED;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /*
+ * Setup special migration page table entry. The write
+ * permission is taken from mpfn rather than from
+ * pte_write(pte): this point is also reached for a
+ * device private swap entry, for which pte is not a
+ * present pte and pte_write() is meaningless.
+ */
+ entry = make_migration_entry(page, mpfn &
+ MIGRATE_PFN_WRITE);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+ /* The loop left ptep one past the last entry it visited */
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return 0;
+}
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ *
+ * The walk is bracketed by mmu notifier invalidate_range_start/end calls, and
+ * on return migrate->end is recomputed from the number of entries collected.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ /*
+ * Use a designated initializer so that every member not named here is
+ * zero/NULL. Assigning the members one by one would leave any other
+ * member of struct mm_walk holding stack garbage for
+ * walk_page_range() to read.
+ */
+ struct mm_walk mm_walk = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+ .vma = migrate->vma,
+ .mm = migrate->vma->vm_mm,
+ .private = migrate,
+ };
+
+ mmu_notifier_invalidate_range_start(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+ walk_page_range(migrate->start, migrate->end, &mm_walk);
+ mmu_notifier_invalidate_range_end(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - tell whether a page can be migrated or is pinned
+ * @page: struct page to check
+ *
+ * A page holding references beyond the ones accounted for below is treated as
+ * pinned and cannot be migrated. The test mirrors the one done in
+ * migrate_page_move_mapping(), but additionally accepts ZONE_DEVICE pages.
+ *
+ * Returns true when the page may be migrated, false otherwise.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+ int expected;
+
+ /*
+ * FIXME: THP is not supported yet. Compound pages are harder to check
+ * than regular ones since they can be mapped by a pmd or, once the
+ * mapping is split, by ptes.
+ */
+ if (PageCompound(page))
+ return false;
+
+ /*
+ * The caller always holds one reference of its own: the one from
+ * isolate_lru_page() for a regular page, or the one taken in
+ * migrate_vma_collect() for a device page.
+ */
+ expected = 1;
+
+ if (is_zone_device_page(page)) {
+ /*
+ * Device private pages have no valid pte, so GUP fails on them
+ * and they cannot really be pinned. A thread waiting on the
+ * migration pte of a pending migration does raise the refcount
+ * though, and that is indistinguishable from a real pin. To
+ * keep two threads racing to migrate back to the CPU from
+ * looping forever (each aborting because the other one waits
+ * on the migration entry), always accept such pages.
+ *
+ * FIXME: the proper fix is to rework migration_entry_wait() so
+ * that it no longer needs a page reference.
+ */
+ if (is_device_private_page(page))
+ return true;
+
+ /* Of the remaining ZONE_DEVICE kinds only device public is allowed */
+ if (!is_device_public_page(page))
+ return false;
+
+ /* ZONE_DEVICE pages carry one extra reference */
+ expected += 1;
+ }
+
+ /* File backed page */
+ if (page_mapping(page)) {
+ expected += 1;
+ expected += page_has_private(page);
+ }
+
+ /* Pinned if references remain once the mappings are accounted for */
+ return page_count(page) - expected <= page_mapcount(page);
+}
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ *
+ * Pages that cannot be locked or isolated, or that fail
+ * migrate_vma_check_page(), are taken out of the migration. Those whose src
+ * entry already carried MIGRATE_PFN_LOCKED on entry (their pte was replaced by
+ * a migration entry during collection) only lose MIGRATE_PFN_MIGRATE here and
+ * get their pte restored in the second loop below.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; (i < npages) && migrate->cpages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ bool remap = true; /* cleared below when the page was not locked at collect time */
+
+ if (!page)
+ continue;
+
+ if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
+ /*
+ * Because we are migrating several pages there can be
+ * a deadlock between 2 concurrent migration where each
+ * are waiting on each other page lock.
+ *
+ * Make migrate_vma() a best effort thing and backoff
+ * for any page we can not lock right away.
+ */
+ if (!trylock_page(page)) {
+ migrate->src[i] = 0;
+ migrate->cpages--;
+ put_page(page);
+ continue;
+ }
+ remap = false;
+ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec (done at most once per call) */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+ put_page(page);
+ }
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ if (!migrate_vma_check_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (!is_zone_device_page(page))
+ putback_lru_page(page);
+ else
+ put_page(page);
+ }
+ }
+ }
+
+ for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_pte(page, migrate->vma, addr, page);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ put_page(page);
+ restore--;
+ }
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * Pages that are still mapped after try_to_unmap(), or that fail
+ * migrate_vma_check_page(), lose MIGRATE_PFN_MIGRATE in a first pass; a second
+ * pass restores their ptes, unlocks them and releases them (put_page() for
+ * ZONE_DEVICE pages, putback_lru_page() otherwise).
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (page_mapped(page)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page))
+ goto restore;
+ }
+
+ if (migrate_vma_check_page(page))
+ continue;
+
+restore:
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ }
+
+ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_ptes(page, page, false);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ restore--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+ }
+}
+
+/*
+ * Map @page at @addr in migrate->vma for an address that had no page behind
+ * it (pte none, or the zero page). Only anonymous VMAs are handled.
+ *
+ * Page tables are allocated as needed, the page is charged to the memcg, and
+ * under the pte lock the pte is set to either a device private swap entry, a
+ * devmap pte (device public) or a regular pte. On success *src is set to
+ * MIGRATE_PFN_MIGRATE; on any failure MIGRATE_PFN_MIGRATE is cleared from *src
+ * and nothing is mapped. @dst is currently unused.
+ */
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ unsigned long *dst)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have down_read(mmap_sem).
+ */
+ if (pte_alloc(mm, pmdp, addr))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ entry = swp_entry_to_pte(swp_entry);
+ } else if (is_device_public_page(page)) {
+ entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkdevmap(entry);
+ } else {
+ /*
+ * Any other kind of ZONE_DEVICE page is not handled
+ * and would leave 'entry' uninitialized; undo the
+ * charge and refuse the page rather than install an
+ * indeterminate pte.
+ */
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ /* Only the zero page may be replaced; anything else aborts */
+ if (!is_zero_pfn(pfn)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+ flush = true;
+ } else if (!pte_none(*ptep)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_active_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+/*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ *
+ * Entries with a destination page but no source page (and MIGRATE_PFN_MIGRATE
+ * set) are populated through migrate_vma_insert_page(); the mmu notifier range
+ * is opened at the first such entry and closed after the loop. Every entry
+ * that cannot be migrated has MIGRATE_PFN_MIGRATE cleared in the src array.
+ */
+static void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr, i, mmu_start; /* mmu_start is only valid once notified is true */
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ continue;
+ }
+ if (!notified) {
+ mmu_start = addr;
+ notified = true;
+ mmu_notifier_invalidate_range_start(mm,
+ mmu_start,
+ migrate->end);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &migrate->src[i],
+ &migrate->dst[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_zone_device_page(newpage)) {
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now only support private anonymous when
+ * migrating to un-addressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else if (!is_device_public_page(newpage)) {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ }
+
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ if (notified)
+ mmu_notifier_invalidate_range_end(mm, mmu_start,
+ migrate->end);
+}
+
+/*
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages. A destination page that ends up unused (no
+ * source page, or migration of that entry failed) is unlocked and released
+ * with put_page().
+ */
+static void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page; /* migration failed: remap the original page */
+ }
+
+ remove_migration_ptes(page, newpage, false);
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+}
+
+/*
+ * migrate_vma() - migrate a range of memory inside vma
+ *
+ * @ops: migration callback for allocating destination memory and copying
+ * @vma: virtual memory area containing the range to be migrated
+ * @start: start address of the range to migrate (inclusive)
+ * @end: end address of the range to migrate (exclusive)
+ * @src: array of migrate pfn entries (pfn plus MIGRATE_PFN_* flags) for the
+ * source pages
+ * @dst: array of migrate pfn entries (pfn plus MIGRATE_PFN_* flags) for the
+ * destination pages
+ * @private: pointer passed back to each of the callback
+ * Returns: 0 on success, error code otherwise
+ *
+ * This function tries to migrate a range of virtual addresses, using
+ * callbacks to allocate and copy memory from source to destination. First it
+ * collects all the pages backing each virtual address in the range, saving this
+ * inside the src array. Then it locks those pages and unmaps them. Once the pages
+ * are locked and unmapped, it checks whether each page is pinned or not. Pages
+ * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
+ * in the corresponding src array entry. It then restores any pages that are
+ * pinned, by remapping and unlocking those pages.
+ *
+ * At this point it calls the alloc_and_copy() callback. For documentation on
+ * what is expected from that callback, see struct migrate_vma_ops comments in
+ * include/linux/migrate.h
+ *
+ * After the alloc_and_copy() callback, this function goes over each entry in
+ * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then the function tries to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the struct
+ * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
+ * array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * It then calls the finalize_and_map() callback. See comments for "struct
+ * migrate_vma_ops", in include/linux/migrate.h for details about
+ * finalize_and_map() behavior.
+ *
+ * After the finalize_and_map() callback, for successfully migrated pages, this
+ * function updates the CPU page table to point to new pages, otherwise it
+ * restores the CPU page table to point to the original source pages.
+ *
+ * Function returns 0 after the above steps, even if no pages were migrated
+ * (The function only returns an error if any of the arguments are invalid.)
+ * It also returns 0 early, without invoking either callback, as soon as no
+ * candidate page is left after the collect, prepare or unmap step.
+ *
+ * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
+ * unsigned long entries.
+ */
+int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private)
+{
+ struct migrate_vma migrate;
+
+ /* Sanity check the arguments (start and end are rounded down to a page boundary) */
+ start &= PAGE_MASK;
+ end &= PAGE_MASK;
+ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+ return -EINVAL;
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end <= vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+ if (!ops || !src || !dst || start >= end)
+ return -EINVAL;
+
+ memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
+ migrate.src = src;
+ migrate.dst = dst;
+ migrate.start = start;
+ migrate.npages = 0;
+ migrate.cpages = 0;
+ migrate.end = end;
+ migrate.vma = vma;
+
+ /* Collect, and try to unmap source pages */
+ migrate_vma_collect(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /* Lock and isolate page */
+ migrate_vma_prepare(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /* Unmap pages */
+ migrate_vma_unmap(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the callback.
+ *
+ * Note that migration can fail in migrate_vma_pages() for each
+ * individual page.
+ */
+ ops->alloc_and_copy(vma, src, dst, start, end, private);
+
+ /* This does the real migration of struct page */
+ migrate_vma_pages(&migrate);
+
+ ops->finalize_and_map(vma, src, dst, start, end, private);
+
+ /* Unlock and remap pages */
+ migrate_vma_finalize(&migrate);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_vma);
+#endif /* defined(CONFIG_MIGRATE_VMA_HELPER) */