aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/gup.c13
-rw-r--r--mm/huge_memory.c94
-rw-r--r--mm/hugetlb.c30
-rw-r--r--mm/khugepaged.c140
-rw-r--r--mm/memblock.c4
-rw-r--r--mm/memory-failure.c6
-rw-r--r--mm/mempolicy.c34
-rw-r--r--mm/page_alloc.c32
-rw-r--r--mm/rmap.c13
-rw-r--r--mm/shmem.c53
-rw-r--r--mm/sparse.c16
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/truncate.c8
-rw-r--r--mm/userfaultfd.c62
-rw-r--r--mm/vmstat.c7
-rw-r--r--mm/z3fold.c101
16 files changed, 394 insertions, 225 deletions
diff --git a/mm/gup.c b/mm/gup.c
index f76e77a2d34b..8cb68a50dbdf 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -385,11 +385,17 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
+ * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
+ * pointer to output page_mask
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
+ * the device's dev_pagemap metadata to avoid repeating expensive lookups.
+ *
+ * On output, the @ctx->page_mask is set according to the size of the page.
+ *
+ * Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
@@ -696,12 +702,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(mm, start)) {
- int ret;
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
- return i ? : ret;
+ goto out;
ctx.page_mask = 0;
goto next_page;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 55478ab3c83b..5da55b38b1b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,40 +629,30 @@ release:
* available
* never: never stall for any thp allocation
*/
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
- gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
- struct mempolicy *pol;
- /*
- * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
- * specified, to express a general desire to stay on the current
- * node for optimistic allocation attempts. If the defrag mode
- * and/or madvise hint requires the direct reclaim then we prefer
- * to fallback to other node rather than node reclaim because that
- * can lead to excessive reclaim even though there is free memory
- * on other nodes. We expect that NUMA preferences are specified
- * by memory policies.
- */
- pol = get_vma_policy(vma, addr);
- if (pol->mode != MPOL_BIND)
- this_node = __GFP_THISNODE;
- mpol_cond_put(pol);
-#endif
+ /* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ /* Kick kcompactd and fail quickly */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+ /* Synchronous compaction if madvised, otherwise kick kcompactd */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- __GFP_KSWAPD_RECLAIM | this_node);
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+
+ /* Only do synchronous compaction if madvised */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- this_node);
- return GFP_TRANSHUGE_LIGHT | this_node;
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+ return GFP_TRANSHUGE_LIGHT;
}
/* Caller must hold page table lock. */
@@ -734,8 +724,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
- gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
- page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+ page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1305,9 +1295,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
- new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
- haddr, numa_node_id());
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -2350,7 +2339,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
}
}
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2365,7 +2354,7 @@ static void freeze_page(struct page *page)
VM_BUG_ON_PAGE(!unmap_success, page);
}
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
{
int i;
if (PageTransHuge(page)) {
@@ -2402,6 +2391,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_unevictable) |
(1L << PG_dirty)));
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+ page_tail->index = head->index + tail;
+
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2422,12 +2417,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
if (page_is_idle(head))
set_page_idle(page_tail);
- /* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
- page_tail->mapping = head->mapping;
-
- page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
/*
@@ -2439,12 +2428,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
}
static void __split_huge_page(struct page *page, struct list_head *list,
- unsigned long flags)
+ pgoff_t end, unsigned long flags)
{
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- pgoff_t end = -1;
int i;
lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2452,9 +2440,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- if (!PageAnon(page))
- end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@ -2483,7 +2468,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
- unfreeze_page(head);
+ remap_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
@@ -2626,6 +2611,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
int count, mapcount, extra_pins, ret;
bool mlocked;
unsigned long flags;
+ pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2648,6 +2634,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ret = -EBUSY;
goto out;
}
+ end = -1;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -2661,10 +2648,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
anon_vma = NULL;
i_mmap_lock_read(mapping);
+
+ /*
+ *__split_huge_page() may need to trim off pages beyond EOF:
+ * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+ * which cannot be nested inside the page tree lock. So note
+ * end now: i_size itself may be changed at any moment, but
+ * head page lock is good enough to serialize the trimming.
+ */
+ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
}
/*
- * Racy check if we can split the page, before freeze_page() will
+ * Racy check if we can split the page, before unmap_page() will
* split PMDs
*/
if (!can_split_huge_page(head, &extra_pins)) {
@@ -2673,7 +2669,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
mlocked = PageMlocked(page);
- freeze_page(head);
+ unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2707,7 +2703,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
- __split_huge_page(page, list, flags);
+ __split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2727,7 +2723,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
- unfreeze_page(head);
+ remap_page(head);
ret = -EBUSY;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c007fb5fb8d5..a80832487981 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1248,10 +1248,11 @@ void free_huge_page(struct page *page)
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;
- set_page_private(page, 0);
- page->mapping = NULL;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
@@ -3233,7 +3234,7 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
- pte_t *src_pte, *dst_pte, entry;
+ pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
@@ -3261,15 +3262,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /* If the pagetables are shared don't copy or take references */
- if (dst_pte == src_pte)
+ /*
+ * If the pagetables are shared don't copy or take references.
+ * dst_pte == src_pte is the common case of src/dest sharing.
+ *
+ * However, src could have 'unshared' and dst shares with
+ * another vma. If dst_pte !none, this implies sharing.
+ * Check here before taking page table lock, and once again
+ * after taking the lock below.
+ */
+ dst_entry = huge_ptep_get(dst_pte);
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
continue;
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- if (huge_pte_none(entry)) { /* skip none entry */
+ dst_entry = huge_ptep_get(dst_pte);
+ if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ /*
+ * Skip if src entry none. Also, skip in the
+ * unlikely case dst entry !none as this implies
+ * sharing with another vma.
+ */
;
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4065,7 +4081,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_sem */
if (unlikely(ret)) {
- ret = -EFAULT;
+ ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c13625c1ad5e..8e2ff195ecb3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1287,7 +1287,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* collapse_shmem - collapse small tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
- * - allocate and freeze a new huge page;
+ * - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
* + swap in pages if necessary;
* + fill in gaps;
@@ -1295,11 +1295,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* - if replacing succeeds:
* + copy data over;
* + free old pages;
- * + unfreeze huge page;
+ * + unlock huge page;
* - if replacing failed;
* + put all pages back and unfreeze them;
* + restore gaps in the page cache;
- * + free huge page;
+ * + unlock and free huge page;
*/
static void collapse_shmem(struct mm_struct *mm,
struct address_space *mapping, pgoff_t start,
@@ -1329,19 +1329,6 @@ static void collapse_shmem(struct mm_struct *mm,
goto out;
}
- new_page->index = start;
- new_page->mapping = mapping;
- __SetPageSwapBacked(new_page);
- __SetPageLocked(new_page);
- BUG_ON(!page_ref_freeze(new_page, 1));
-
- /*
- * At this point the new_page is 'frozen' (page_count() is zero),
- * locked and not up-to-date. It's safe to insert it into the page
- * cache, because nobody would be able to map it or use it in other
- * way until we unfreeze it.
- */
-
/* This will be less messy when we use multi-index entries */
do {
xas_lock_irq(&xas);
@@ -1349,19 +1336,44 @@ static void collapse_shmem(struct mm_struct *mm,
if (!xas_error(&xas))
break;
xas_unlock_irq(&xas);
- if (!xas_nomem(&xas, GFP_KERNEL))
+ if (!xas_nomem(&xas, GFP_KERNEL)) {
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ result = SCAN_FAIL;
goto out;
+ }
} while (1);
+ __SetPageLocked(new_page);
+ __SetPageSwapBacked(new_page);
+ new_page->index = start;
+ new_page->mapping = mapping;
+
+ /*
+ * At this point the new_page is locked and not up-to-date.
+ * It's safe to insert it into the page cache, because nobody would
+ * be able to map it or use it in another way until we unlock it.
+ */
+
xas_set(&xas, start);
for (index = start; index < end; index++) {
struct page *page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
if (!page) {
+ /*
+ * Stop if extent has been truncated or hole-punched,
+ * and is now completely empty.
+ */
+ if (index == start) {
+ if (!xas_next_entry(&xas, end - 1)) {
+ result = SCAN_TRUNCATED;
+ goto xa_locked;
+ }
+ xas_set(&xas, index);
+ }
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
- break;
+ goto xa_locked;
}
xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
nr_none++;
@@ -1376,13 +1388,12 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_unlocked;
}
- xas_lock_irq(&xas);
- xas_set(&xas, index);
} else if (trylock_page(page)) {
get_page(page);
+ xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
- break;
+ goto xa_locked;
}
/*
@@ -1391,17 +1402,24 @@ static void collapse_shmem(struct mm_struct *mm,
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageUptodate(page), page);
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ /*
+ * If file was truncated then extended, or hole-punched, before
+ * we locked the first page, then a THP might be there already.
+ */
+ if (PageTransCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unlock;
+ }
if (page_mapping(page) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
}
- xas_unlock_irq(&xas);
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
- goto out_isolate_failed;
+ goto out_unlock;
}
if (page_mapped(page))
@@ -1421,7 +1439,9 @@ static void collapse_shmem(struct mm_struct *mm,
*/
if (!page_ref_freeze(page, 3)) {
result = SCAN_PAGE_COUNT;
- goto out_lru;
+ xas_unlock_irq(&xas);
+ putback_lru_page(page);
+ goto out_unlock;
}
/*
@@ -1433,71 +1453,74 @@ static void collapse_shmem(struct mm_struct *mm,
/* Finally, replace with the new page. */
xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
continue;
-out_lru:
- xas_unlock_irq(&xas);
- putback_lru_page(page);
-out_isolate_failed:
- unlock_page(page);
- put_page(page);
- goto xa_unlocked;
out_unlock:
unlock_page(page);
put_page(page);
- break;
+ goto xa_unlocked;
}
- xas_unlock_irq(&xas);
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ if (nr_none) {
+ struct zone *zone = page_zone(new_page);
+
+ __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+ __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+ }
+
+xa_locked:
+ xas_unlock_irq(&xas);
xa_unlocked:
+
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
- struct zone *zone = page_zone(new_page);
/*
* Replacing old pages with new one has succeeded, now we
* need to copy the content and free the old pages.
*/
+ index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ while (index < page->index) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
page);
list_del(&page->lru);
- unlock_page(page);
- page_ref_unfreeze(page, 1);
page->mapping = NULL;
+ page_ref_unfreeze(page, 1);
ClearPageActive(page);
ClearPageUnevictable(page);
+ unlock_page(page);
put_page(page);
+ index++;
}
-
- local_irq_disable();
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
- if (nr_none) {
- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+ while (index < end) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
}
- local_irq_enable();
- /*
- * Remove pte page tables, so we can re-fault
- * the page as huge.
- */
- retract_page_tables(mapping, start);
-
- /* Everything is ready, let's unfreeze the new_page */
- set_page_dirty(new_page);
SetPageUptodate(new_page);
- page_ref_unfreeze(new_page, HPAGE_PMD_NR);
+ page_ref_add(new_page, HPAGE_PMD_NR - 1);
+ set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_anon(new_page);
- unlock_page(new_page);
+ /*
+ * Remove pte page tables, so we can re-fault the page as huge.
+ */
+ retract_page_tables(mapping, start);
*hpage = NULL;
khugepaged_pages_collapsed++;
} else {
struct page *page;
+
/* Something went wrong: roll back page cache changes */
- shmem_uncharge(mapping->host, nr_none);
xas_lock_irq(&xas);
+ mapping->nrpages -= nr_none;
+ shmem_uncharge(mapping->host, nr_none);
+
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
page = list_first_entry_or_null(&pagelist,
@@ -1519,19 +1542,18 @@ xa_unlocked:
xas_store(&xas, page);
xas_pause(&xas);
xas_unlock_irq(&xas);
- putback_lru_page(page);
unlock_page(page);
+ putback_lru_page(page);
xas_lock_irq(&xas);
}
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- /* Unfreeze new_page, caller would take care about freeing it */
- page_ref_unfreeze(new_page, 1);
mem_cgroup_cancel_charge(new_page, memcg, true);
- unlock_page(new_page);
new_page->mapping = NULL;
}
+
+ unlock_page(new_page);
out:
VM_BUG_ON(!list_empty(&pagelist));
/* TODO: tracepoints */
diff --git a/mm/memblock.c b/mm/memblock.c
index 7df468c8ebc8..81ae63ca78d0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1179,7 +1179,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * Common iterator interface used to define for_each_mem_range().
+ * Common iterator interface used to define for_each_mem_pfn_range().
*/
void __init_memblock __next_mem_pfn_range(int *idx, int nid,
unsigned long *out_start_pfn,
@@ -1727,7 +1727,7 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
-bool __init memblock_is_reserved(phys_addr_t addr)
+bool __init_memblock memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0cd3de3550f0..7c72f2a95785 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1161,6 +1161,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
LIST_HEAD(tokill);
int rc = -EBUSY;
loff_t start;
+ dax_entry_t cookie;
/*
* Prevent the inode from being freed while we are interrogating
@@ -1169,7 +1170,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- if (!dax_lock_mapping_entry(page))
+ cookie = dax_lock_page(page);
+ if (!cookie)
goto out;
if (hwpoison_filter(page)) {
@@ -1220,7 +1222,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
- dax_unlock_mapping_entry(page);
+ dax_unlock_page(page, cookie);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5837a067124d..d4496d9d34f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
} else if (PageTransHuge(page)) {
struct page *thp;
- thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
- address, numa_node_id());
+ thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+ HPAGE_PMD_ORDER);
if (!thp)
return NULL;
prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,6 +2011,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
* @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
@@ -2021,7 +2022,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node)
+ unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
@@ -2039,6 +2040,31 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
}
+ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+ int hpage_node = node;
+
+ /*
+ * For hugepage allocation and non-interleave policy which
+ * allows the current node (or other explicitly preferred
+ * node) we only try to allocate from the current/preferred
+ * node and don't fall back to other nodes, as the cost of
+ * remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave, or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
+ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+ hpage_node = pol->v.preferred_node;
+
+ nmask = policy_nodemask(gfp, pol);
+ if (!nmask || node_isset(hpage_node, *nmask)) {
+ mpol_cond_put(pol);
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_THISNODE, order);
+ goto out;
+ }
+ }
+
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a919ba5cb3c8..2ec9cc407216 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4061,17 +4061,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int reserve_flags;
/*
- * In the slowpath, we sanity check order to avoid ever trying to
- * reclaim >= MAX_ORDER areas which will never succeed. Callers may
- * be using allocators in order of preference for an area that is
- * too large.
- */
- if (order >= MAX_ORDER) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
- return NULL;
- }
-
- /*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
@@ -4364,6 +4353,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
+ /*
+ * There are several places where we assume that the order value is sane
+ * so bail out early if the request is out of bound.
+ */
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ return NULL;
+ }
+
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
@@ -5815,8 +5813,10 @@ void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
+ int zone_idx = zone_idx(zone) + 1;
- pgdat->nr_zones = zone_idx(zone) + 1;
+ if (zone_idx > pgdat->nr_zones)
+ pgdat->nr_zones = zone_idx;
zone->zone_start_pfn = zone_start_pfn;
@@ -7789,6 +7789,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
goto unmovable;
/*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
* Hugepages are not in LRU lists, but they're movable.
* We need not scan over tail pages bacause we don't
* handle each tail page individually in migration.
diff --git a/mm/rmap.c b/mm/rmap.c
index 1e79fac3186b..85b7f9423352 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1627,16 +1627,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address + PAGE_SIZE);
} else {
/*
- * We should not need to notify here as we reach this
- * case only from freeze_page() itself only call from
- * split_huge_page_to_list() so everything below must
- * be true:
- * - page is not anonymous
- * - page is locked
- *
- * So as it is a locked file back page thus it can not
- * be remove from the page cache and replace by a new
- * page before mmu_notifier_invalidate_range_end so no
+ * This is a locked file-backed page, thus it cannot
+ * be removed from the page cache and replaced by a new
+ * page before mmu_notifier_invalidate_range_end, so no
* concurrent thread might update its page table to
* point at new page while a device still is using this
* page.
diff --git a/mm/shmem.c b/mm/shmem.c
index ea26d7a0342d..5d07e0b1352f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -297,12 +297,14 @@ bool shmem_charge(struct inode *inode, long pages)
if (!shmem_inode_acct_block(inode, pages))
return false;
+ /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
+ inode->i_mapping->nrpages += pages;
+
spin_lock_irqsave(&info->lock, flags);
info->alloced += pages;
inode->i_blocks += pages * BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
spin_unlock_irqrestore(&info->lock, flags);
- inode->i_mapping->nrpages += pages;
return true;
}
@@ -312,6 +314,8 @@ void shmem_uncharge(struct inode *inode, long pages)
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long flags;
+ /* nrpages adjustment done by __delete_from_page_cache() or caller */
+
spin_lock_irqsave(&info->lock, flags);
info->alloced -= pages;
inode->i_blocks -= pages * BLOCKS_PER_PAGE;
@@ -657,9 +661,7 @@ static int shmem_free_swap(struct address_space *mapping,
{
void *old;
- xa_lock_irq(&mapping->i_pages);
- old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
- xa_unlock_irq(&mapping->i_pages);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
if (old != radswap)
return -ENOENT;
free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -1435,7 +1437,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
+ HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1509,11 +1511,13 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
{
struct page *oldpage, *newpage;
struct address_space *swap_mapping;
+ swp_entry_t entry;
pgoff_t swap_index;
int error;
oldpage = *pagep;
- swap_index = page_private(oldpage);
+ entry.val = page_private(oldpage);
+ swap_index = swp_offset(entry);
swap_mapping = page_mapping(oldpage);
/*
@@ -1532,7 +1536,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
__SetPageLocked(newpage);
__SetPageSwapBacked(newpage);
SetPageUptodate(newpage);
- set_page_private(newpage, swap_index);
+ set_page_private(newpage, entry.val);
SetPageSwapCache(newpage);
/*
@@ -2214,6 +2218,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
struct page *page;
pte_t _dst_pte, *dst_pte;
int ret;
+ pgoff_t offset, max_off;
ret = -ENOMEM;
if (!shmem_inode_acct_block(inode, 1))
@@ -2236,7 +2241,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
*pagep = page;
shmem_inode_unacct_blocks(inode, 1);
/* don't free the page */
- return -EFAULT;
+ return -ENOENT;
}
} else { /* mfill_zeropage_atomic */
clear_highpage(page);
@@ -2251,6 +2256,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
__SetPageSwapBacked(page);
__SetPageUptodate(page);
+ ret = -EFAULT;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ goto out_release;
+
ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
if (ret)
goto out_release;
@@ -2265,9 +2276,25 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
if (dst_vma->vm_flags & VM_WRITE)
_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+ else {
+ /*
+ * We don't set the pte dirty if the vma has no
+ * VM_WRITE permission, so mark the page dirty or it
+ * could be freed from under us. We could do it
+ * unconditionally before unlock_page(), but doing it
+ * only if VM_WRITE is not set is faster.
+ */
+ set_page_dirty(page);
+ }
- ret = -EEXIST;
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ ret = -EFAULT;
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ goto out_release_uncharge_unlock;
+
+ ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_release_uncharge_unlock;
@@ -2285,13 +2312,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
- unlock_page(page);
pte_unmap_unlock(dst_pte, ptl);
+ unlock_page(page);
ret = 0;
out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
out_release_uncharge:
mem_cgroup_cancel_charge(page, memcg, false);
out_release:
@@ -2563,9 +2592,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
inode_lock(inode);
/* We're holding i_mutex so we can access i_size directly */
- if (offset < 0)
- offset = -EINVAL;
- else if (offset >= inode->i_size)
+ if (offset < 0 || offset >= inode->i_size)
offset = -ENXIO;
else {
start = offset >> PAGE_SHIFT;
diff --git a/mm/sparse.c b/mm/sparse.c
index 33307fc05c4d..3abc8cc50201 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -240,6 +240,22 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
}
/*
+ * Mark all memblocks as present using memory_present(). This is a
+ * convienence function that is useful for a number of arches
+ * to mark all of the systems memory as present during initialization.
+ */
+void __init memblocks_present(void)
+{
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg) {
+ memory_present(memblock_get_region_node(reg),
+ memblock_region_memory_base_pfn(reg),
+ memblock_region_memory_end_pfn(reg));
+ }
+}
+
+/*
* Subtle, we encode the real pfn into the mem_map such that
* the identity pfn - section_mem_map will return the actual
* physical page frame number.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 644f746e167a..8688ae65ef58 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2813,7 +2813,7 @@ static struct swap_info_struct *alloc_swap_info(void)
unsigned int type;
int i;
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kvzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -2824,7 +2824,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
- kfree(p);
+ kvfree(p);
return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
@@ -2838,7 +2838,7 @@ static struct swap_info_struct *alloc_swap_info(void)
smp_wmb();
nr_swapfiles++;
} else {
- kfree(p);
+ kvfree(p);
p = swap_info[type];
/*
* Do not memset this entry: a racing procfs swap_next()
diff --git a/mm/truncate.c b/mm/truncate.c
index 45d68e90b703..798e7ccfb030 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -517,9 +517,13 @@ void truncate_inode_pages_final(struct address_space *mapping)
*/
xa_lock_irq(&mapping->i_pages);
xa_unlock_irq(&mapping->i_pages);
-
- truncate_inode_pages(mapping, 0);
}
+
+ /*
+ * Cleancache needs notification even if there are no pages or shadow
+ * entries.
+ */
+ truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 5029f241908f..458acda96f20 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -33,6 +33,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
void *page_kaddr;
int ret;
struct page *page;
+ pgoff_t offset, max_off;
+ struct inode *inode;
if (!*pagep) {
ret = -ENOMEM;
@@ -48,7 +50,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_sem */
if (unlikely(ret)) {
- ret = -EFAULT;
+ ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
@@ -73,8 +75,17 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (dst_vma->vm_flags & VM_WRITE)
_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
- ret = -EEXIST;
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (dst_vma->vm_file) {
+ /* the shmem MAP_PRIVATE case requires checking the i_size */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_release_uncharge_unlock;
+ }
+ ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_release_uncharge_unlock;
@@ -108,11 +119,22 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
+ pgoff_t offset, max_off;
+ struct inode *inode;
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
- ret = -EEXIST;
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (dst_vma->vm_file) {
+ /* the shmem MAP_PRIVATE case requires checking the i_size */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+ ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_unlock;
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -205,8 +227,9 @@ retry:
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
goto out_unlock;
/*
- * Only allow __mcopy_atomic_hugetlb on userfaultfd
- * registered ranges.
+ * Check the vma is registered in uffd, this is
+ * required to enforce the VM_MAYWRITE check done at
+ * uffd registration time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
@@ -274,7 +297,7 @@ retry:
cond_resched();
- if (unlikely(err == -EFAULT)) {
+ if (unlikely(err == -ENOENT)) {
up_read(&dst_mm->mmap_sem);
BUG_ON(!page);
@@ -380,7 +403,17 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
{
ssize_t err;
- if (vma_is_anonymous(dst_vma)) {
+ /*
+ * The normal page fault path for a shmem will invoke the
+ * fault, fill the hole in the file and COW it right away. The
+ * result generates plain anonymous memory. So when we are
+ * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
+ * generate anonymous memory directly without actually filling
+ * the hole. For the MAP_PRIVATE case the robustness check
+ * only happens in the pagetable (to verify it's still none)
+ * and not in the radix tree.
+ */
+ if (!(dst_vma->vm_flags & VM_SHARED)) {
if (!zeropage)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, page);
@@ -449,13 +482,9 @@ retry:
if (!dst_vma)
goto out_unlock;
/*
- * Be strict and only allow __mcopy_atomic on userfaultfd
- * registered ranges to prevent userland errors going
- * unnoticed. As far as the VM consistency is concerned, it
- * would be perfectly safe to remove this check, but there's
- * no useful usage for __mcopy_atomic ouside of userfaultfd
- * registered ranges. This is after all why these are ioctls
- * belonging to the userfaultfd and not syscalls.
+ * Check the vma is registered in uffd, this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
@@ -489,7 +518,8 @@ retry:
* dst_vma.
*/
err = -ENOMEM;
- if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
+ if (!(dst_vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(dst_vma)))
goto out_unlock;
while (src_addr < src_start + len) {
@@ -530,7 +560,7 @@ retry:
src_addr, &page, zeropage);
cond_resched();
- if (unlikely(err == -EFAULT)) {
+ if (unlikely(err == -ENOENT)) {
void *page_kaddr;
up_read(&dst_mm->mmap_sem);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6038ce593ce3..9c624595e904 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1827,12 +1827,13 @@ static bool need_update(int cpu)
/*
* The fast way of checking if there are any vmstat diffs.
- * This works because the diffs are byte sized items.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(p->vm_stat_diff[0])))
return true;
#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+ sizeof(p->vm_numa_stat_diff[0])))
return true;
#endif
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 4b366d181f35..aee9b0b8d907 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -99,6 +99,7 @@ struct z3fold_header {
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
#define BUDDY_MASK (0x3)
+#define BUDDY_SHIFT 2
/**
* struct z3fold_pool - stores metadata for each z3fold pool
@@ -145,7 +146,7 @@ enum z3fold_page_flags {
MIDDLE_CHUNK_MAPPED,
NEEDS_COMPACTING,
PAGE_STALE,
- UNDER_RECLAIM
+ PAGE_CLAIMED, /* by either reclaim or free */
};
/*****************
@@ -174,7 +175,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
- clear_bit(UNDER_RECLAIM, &page->private);
+ clear_bit(PAGE_CLAIMED, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -223,8 +224,11 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
unsigned long handle;
handle = (unsigned long)zhdr;
- if (bud != HEADLESS)
- handle += (bud + zhdr->first_num) & BUDDY_MASK;
+ if (bud != HEADLESS) {
+ handle |= (bud + zhdr->first_num) & BUDDY_MASK;
+ if (bud == LAST)
+ handle |= (zhdr->last_chunks << BUDDY_SHIFT);
+ }
return handle;
}
@@ -234,6 +238,12 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
return (struct z3fold_header *)(handle & PAGE_MASK);
}
+/* only for LAST bud, returns zero otherwise */
+static unsigned short handle_to_chunks(unsigned long handle)
+{
+ return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+}
+
/*
* (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
* but that doesn't matter. because the masking will result in the
@@ -720,37 +730,39 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
page = virt_to_page(zhdr);
if (test_bit(PAGE_HEADLESS, &page->private)) {
- /* HEADLESS page stored */
- bud = HEADLESS;
- } else {
- z3fold_page_lock(zhdr);
- bud = handle_to_buddy(handle);
-
- switch (bud) {
- case FIRST:
- zhdr->first_chunks = 0;
- break;
- case MIDDLE:
- zhdr->middle_chunks = 0;
- zhdr->start_middle = 0;
- break;
- case LAST:
- zhdr->last_chunks = 0;
- break;
- default:
- pr_err("%s: unknown bud %d\n", __func__, bud);
- WARN_ON(1);
- z3fold_page_unlock(zhdr);
- return;
+ /* if a headless page is under reclaim, just leave.
+ * NB: we use test_and_set_bit for a reason: if the bit
+ * has not been set before, we release this page
+ * immediately so we don't care about its value any more.
+ */
+ if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ spin_lock(&pool->lock);
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
}
+ return;
}
- if (bud == HEADLESS) {
- spin_lock(&pool->lock);
- list_del(&page->lru);
- spin_unlock(&pool->lock);
- free_z3fold_page(page);
- atomic64_dec(&pool->pages_nr);
+ /* Non-headless case */
+ z3fold_page_lock(zhdr);
+ bud = handle_to_buddy(handle);
+
+ switch (bud) {
+ case FIRST:
+ zhdr->first_chunks = 0;
+ break;
+ case MIDDLE:
+ zhdr->middle_chunks = 0;
+ break;
+ case LAST:
+ zhdr->last_chunks = 0;
+ break;
+ default:
+ pr_err("%s: unknown bud %d\n", __func__, bud);
+ WARN_ON(1);
+ z3fold_page_unlock(zhdr);
return;
}
@@ -758,7 +770,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(UNDER_RECLAIM, &page->private)) {
+ if (test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -836,20 +848,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
}
list_for_each_prev(pos, &pool->lru) {
page = list_entry(pos, struct page, lru);
+
+ /* this bit could have been set by free, in which case
+ * we pass over to the next page in the pool.
+ */
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ continue;
+
+ zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
- /* candidate found */
break;
- zhdr = page_address(page);
- if (!z3fold_page_trylock(zhdr))
+ if (!z3fold_page_trylock(zhdr)) {
+ zhdr = NULL;
continue; /* can't evict at this point */
+ }
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
- set_bit(UNDER_RECLAIM, &page->private);
break;
}
+ if (!zhdr)
+ break;
+
list_del_init(&page->lru);
spin_unlock(&pool->lock);
@@ -898,6 +920,7 @@ next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
return 0;
}
spin_lock(&pool->lock);
@@ -905,7 +928,7 @@ next:
spin_unlock(&pool->lock);
} else {
z3fold_page_lock(zhdr);
- clear_bit(UNDER_RECLAIM, &page->private);
+ clear_bit(PAGE_CLAIMED, &page->private);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
@@ -964,7 +987,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
break;
case LAST:
- addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
break;
default:
pr_err("unknown buddy id %d\n", buddy);