about | summary | refs | log | tree | commit | diff | stats | homepage
path: root/mm
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2022-05-27 11:40:49 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2022-05-27 11:40:49 -0700
commit: 8291eaafed36f575f23951f3ce18407f480e9ecf (patch)
tree: 279b61422ba2df7b8579af8ccc81331de80affa8 /mm
parent: Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm (diff)
parent: mm/shmem.c: suppress shift warning (diff)
download: wireguard-linux-8291eaafed36f575f23951f3ce18407f480e9ecf.tar.xz
          wireguard-linux-8291eaafed36f575f23951f3ce18407f480e9ecf.zip
Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton:

 - Two follow-on fixes for the post-5.19 series "Use pageblock_order for
   cma and alloc_contig_range alignment", from Zi Yan.

 - A series of z3fold cleanups and fixes from Miaohe Lin.

 - Some memcg selftests work from Michal Koutný <mkoutny@suse.com>

 - Some swap fixes and cleanups from Miaohe Lin

 - Several individual minor fixups

* tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
  mm/shmem.c: suppress shift warning
  mm: Kconfig: reorganize misplaced mm options
  mm: kasan: fix input of vmalloc_to_page()
  mm: fix is_pinnable_page against a cma page
  mm: filter out swapin error entry in shmem mapping
  mm/shmem: fix infinite loop when swap in shmem error at swapoff time
  mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range
  mm/swapfile: fix lost swap bits in unuse_pte()
  mm/swapfile: unuse_pte can map random data if swap read fails
  selftests: memcg: factor out common parts of memory.{low,min} tests
  selftests: memcg: remove protection from top level memcg
  selftests: memcg: adjust expected reclaim values of protected cgroups
  selftests: memcg: expect no low events in unprotected sibling
  selftests: memcg: fix compilation
  mm/z3fold: fix z3fold_page_migrate races with z3fold_map
  mm/z3fold: fix z3fold_reclaim_page races with z3fold_free
  mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock
  mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails
  revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"
  mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           56
-rw-r--r--  mm/Kconfig.debug     33
-rw-r--r--  mm/internal.h         4
-rw-r--r--  mm/kasan/report.c     2
-rw-r--r--  mm/madvise.c         18
-rw-r--r--  mm/memory.c           5
-rw-r--r--  mm/page_alloc.c      32
-rw-r--r--  mm/page_isolation.c  36
-rw-r--r--  mm/shmem.c           41
-rw-r--r--  mm/swap_state.c       3
-rw-r--r--  mm/swapfile.c        21
-rw-r--r--  mm/z3fold.c          97
12 files changed, 261 insertions, 87 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 905c205e14f3..169e64192e48 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED
sanity-checking than others. This option is most effective with
CONFIG_SLUB.
+config SLUB_STATS
+ default n
+ bool "Enable SLUB performance statistics"
+ depends on SLUB && SYSFS
+ help
+ SLUB statistics are useful to debug SLUBs allocation behavior in
+ order find ways to optimize the allocator. This should never be
+ enabled for production use since keeping statistics slows down
+ the allocator by a few percentage points. The slabinfo command
+ supports the determination of the most active slabs to figure
+ out which slabs are relevant to a particular load.
+ Try running: slabinfo -DA
+
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
@@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR
Say Y if unsure.
+config COMPAT_BRK
+ bool "Disable heap randomization"
+ default y
+ help
+ Randomizing heap placement makes heap exploits harder, but it
+ also breaks ancient binaries (including anything libc5 based).
+ This option changes the bootup default to heap randomization
+ disabled, and can be overridden at runtime by setting
+ /proc/sys/kernel/randomize_va_space to 2.
+
+ On non-ancient distros (post-2000 ones) N is usually a safe choice.
+
+config MMAP_ALLOW_UNINITIALIZED
+ bool "Allow mmapped anonymous memory to be uninitialized"
+ depends on EXPERT && !MMU
+ default n
+ help
+ Normally, and according to the Linux spec, anonymous memory obtained
+ from mmap() has its contents cleared before it is passed to
+ userspace. Enabling this config option allows you to request that
+ mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
+ providing a huge performance boost. If this option is not enabled,
+ then the flag will be ignored.
+
+ This is taken advantage of by uClibc's malloc(), and also by
+ ELF-FDPIC binfmt's brk and stack allocator.
+
+ Because of the obvious security issues, this option should only be
+ enabled on embedded devices where you control what is run in
+ userspace. Since that isn't generally a problem on no-MMU systems,
+ it is normally safe to say Y here.
+
+ See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
+
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
@@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
+config VM_EVENT_COUNTERS
+ default y
+ bool "Enable VM event counters for /proc/vmstat" if EXPERT
+ help
+ VM event counters are needed for event counts to be shown.
+ This option allows the disabling of the VM event counters
+ on EXPERT systems. /proc/vmstat will only show page counts
+ if VM event counters are disabled.
+
config PERCPU_STATS
bool "Collect percpu memory statistics"
help
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5bd5bb097252..ce8dded36de9 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config DEBUG_SLAB
+ bool "Debug slab memory allocations"
+ depends on DEBUG_KERNEL && SLAB
+ help
+ Say Y here to have the kernel do limited verification on memory
+ allocation as well as poisoning memory on free to catch use of freed
+ memory. This can make kmalloc/kfree-intensive workloads much slower.
+
+config SLUB_DEBUG
+ default y
+ bool "Enable SLUB debugging support" if EXPERT
+ depends on SLUB && SYSFS
+ select STACKDEPOT if STACKTRACE_SUPPORT
+ help
+ SLUB has extensive debug support features. Disabling these can
+ result in significant savings in code size. This also disables
+ SLUB sysfs support. /sys/slab will not exist and there will be
+ no support for cache validation etc.
+
+config SLUB_DEBUG_ON
+ bool "SLUB debugging on by default"
+ depends on SLUB && SLUB_DEBUG
+ select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
+ default n
+ help
+ Boot with debugging on by default. SLUB boots by default with
+ the runtime debug capabilities switched off. Enabling this is
+ equivalent to specifying the "slub_debug" parameter on boot.
+ There is no support for more fine grained debug control like
+ possible with slub_debug=xxx. SLUB debugging may be switched
+ off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
+ "slub_debug=-".
+
config PAGE_OWNER
bool "Track page owner"
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
diff --git a/mm/internal.h b/mm/internal.h
index 64e61b032dac..c0f8fbe0445b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
-void split_free_page(struct page *free_page,
- int order, unsigned long split_pfn_offset);
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 199d77cce21a..b341a191651d 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag)
va->addr, va->addr + va->size, va->caller);
pr_err("\n");
- page = vmalloc_to_page(page);
+ page = vmalloc_to_page(addr);
}
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d6592488b51..d7b4f2602949 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
if (!xa_is_value(page))
continue;
+ swap = radix_to_swp_entry(page);
+ /* There might be swapin error entries in shmem mapping. */
+ if (non_swap_entry(swap))
+ continue;
xas_pause(&xas);
rcu_read_unlock();
- swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false, &splug);
if (page)
@@ -624,11 +627,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
swp_entry_t entry;
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry))
- continue;
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ if (!non_swap_entry(entry)) {
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ }
continue;
}
diff --git a/mm/memory.c b/mm/memory.c
index 54bcd5327b74..21dadf03f089 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1487,7 +1487,8 @@ again:
/* Only drop the uffd-wp marker if explicitly requested */
if (!zap_drop_file_uffd_wp(details))
continue;
- } else if (is_hwpoison_entry(entry)) {
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
if (!should_zap_cows(details))
continue;
} else {
@@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
+ } else if (is_swapin_error_entry(entry)) {
+ ret = VM_FAULT_SIGBUS;
} else if (is_pte_marker_entry(entry)) {
ret = handle_pte_marker(vmf);
} else {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 149f2ab5063b..e008a3df0485 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
+ /*
+ * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * a consistent read of the memory array, so that results, even though
+ * racy, are not corrupted.
+ */
+ word = READ_ONCE(bitmap[word_bitidx]);
return (word >> bitidx) & mask;
}
@@ -1100,30 +1104,44 @@ done_merging:
* @order: the order of the page
* @split_pfn_offset: split offset within the page
*
+ * Return -ENOENT if the free page is changed, otherwise 0
+ *
* It is used when the free page crosses two pageblocks with different migratetypes
* at split_pfn_offset within the page. The split free page will be put into
* separate migratetype lists afterwards. Otherwise, the function achieves
* nothing.
*/
-void split_free_page(struct page *free_page,
- int order, unsigned long split_pfn_offset)
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset)
{
struct zone *zone = page_zone(free_page);
unsigned long free_page_pfn = page_to_pfn(free_page);
unsigned long pfn;
unsigned long flags;
int free_page_order;
+ int mt;
+ int ret = 0;
if (split_pfn_offset == 0)
- return;
+ return ret;
spin_lock_irqsave(&zone->lock, flags);
+
+ if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mt = get_pageblock_migratetype(free_page);
+ if (likely(!is_migrate_isolate(mt)))
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+
del_page_from_free_list(free_page, zone, order);
for (pfn = free_page_pfn;
pfn < free_page_pfn + (1UL << order);) {
int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
- free_page_order = min_t(int,
+ free_page_order = min_t(unsigned int,
pfn ? __ffs(pfn) : order,
__fls(split_pfn_offset));
__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
@@ -1134,7 +1152,9 @@ void split_free_page(struct page *free_page,
if (split_pfn_offset == 0)
split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
}
+out:
spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c643c8420809..6021f8444b5a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before)
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
{
unsigned char saved_mt;
unsigned long start_pfn;
@@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
- isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
- if (ret)
- return ret;
+ if (skip_isolation)
+ VM_BUG_ON(!is_migrate_isolate(saved_mt));
+ else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
+ isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+ if (ret)
+ return ret;
+ }
/*
* Bail out early when the to-be-isolated pageblock does not form
@@ -366,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
- if (pfn + (1UL << order) > boundary_pfn)
- split_free_page(page, order, boundary_pfn - pfn);
- pfn += (1UL << order);
+ if (pfn + (1UL << order) > boundary_pfn) {
+ /* free page changed before split, check it again */
+ if (split_free_page(page, order, boundary_pfn - pfn))
+ continue;
+ }
+
+ pfn += 1UL << order;
continue;
}
/*
@@ -463,7 +472,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
return 0;
failed:
/* restore the original migratetype */
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ if (!skip_isolation)
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
return -EBUSY;
}
@@ -522,14 +532,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
int ret;
+ bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
if (ret)
return ret;
+ if (isolate_start == isolate_end - pageblock_nr_pages)
+ skip_isolation = true;
+
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index da30c769b376..a6f565308133 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
continue;
entry = radix_to_swp_entry(folio);
+ /*
+ * swapin error entries can be found in the mapping. But they're
+ * deliberately ignored here as we've done everything we can do.
+ */
if (swp_type(entry) != type)
continue;
@@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
return error;
}
+static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
+ struct folio *folio, swp_entry_t swap)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swapin_error;
+ void *old;
+
+ swapin_error = make_swapin_error_entry(&folio->page);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index,
+ swp_to_radix_entry(swap),
+ swp_to_radix_entry(swapin_error), 0);
+ if (old != swp_to_radix_entry(swap))
+ return;
+
+ folio_wait_writeback(folio);
+ delete_from_swap_cache(&folio->page);
+ spin_lock_irq(&info->lock);
+ /*
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
+ * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
+ * shmem_evict_inode.
+ */
+ info->alloced--;
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ swap_free(swap);
+}
+
/*
* Swap in the page pointed to by *pagep.
* Caller has to make sure that *pagep contains a valid swapped page.
@@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*foliop);
*foliop = NULL;
+ if (is_swapin_error_entry(swap))
+ return -EIO;
+
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
@@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
+ if (error == -EIO)
+ shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
if (folio) {
folio_unlock(folio);
@@ -1906,7 +1945,7 @@ alloc_nohuge:
spin_lock_irq(&info->lock);
info->alloced += folio_nr_pages(folio);
- inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio);
+ inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
alloced = true;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b9e4ed2e90bf..778d57d2d92d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
return NULL;
swp = radix_to_swp_entry(page);
+ /* There might be swapin error entries in shmem mapping. */
+ if (non_swap_entry(swp))
+ return NULL;
/* Prevent swapoff from happening to us */
si = get_swap_device(swp);
if (!si)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 94b4ff43ead0..a2e66d855b19 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1775,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
{
struct page *swapcache;
spinlock_t *ptl;
- pte_t *pte;
+ pte_t *pte, new_pte;
int ret = 1;
swapcache = page;
@@ -1789,6 +1789,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
+ if (unlikely(!PageUptodate(page))) {
+ pte_t pteval;
+
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+ set_pte_at(vma->vm_mm, addr, pte, pteval);
+ swap_free(entry);
+ ret = 0;
+ goto out;
+ }
+
/* See do_swap_page() */
BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
BUG_ON(PageAnon(page) && PageAnonExclusive(page));
@@ -1813,8 +1824,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
}
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ if (pte_swp_soft_dirty(*pte))
+ new_pte = pte_mksoft_dirty(new_pte);
+ if (pte_swp_uffd_wp(*pte))
+ new_pte = pte_mkuffd_wp(new_pte);
+ set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
pte_unmap_unlock(pte, ptl);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 83b5a3514427..f41f8b0d9e9a 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -181,6 +181,7 @@ enum z3fold_page_flags {
NEEDS_COMPACTING,
PAGE_STALE,
PAGE_CLAIMED, /* by either reclaim or free */
+ PAGE_MIGRATED, /* page is migrated and soon to be released */
};
/*
@@ -212,10 +213,8 @@ static int size_to_chunks(size_t size)
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
- struct z3fold_buddy_slots *slots;
-
- slots = kmem_cache_zalloc(pool->c_handle,
- (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
+ struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
+ gfp);
if (slots) {
/* It will be freed separately in free_handle(). */
@@ -272,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
- if (locked)
- break;
+ if (locked) {
+ struct page *page = virt_to_page(zhdr);
+
+ if (!test_bit(PAGE_MIGRATED, &page->private))
+ break;
+ z3fold_page_unlock(zhdr);
+ }
cpu_relax();
} while (true);
} else {
@@ -391,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
+ clear_bit(PAGE_MIGRATED, &page->private);
if (headless)
return zhdr;
@@ -521,13 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
atomic64_dec(&pool->pages_nr);
}
-static void release_z3fold_page(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- __release_z3fold_page(zhdr, false);
-}
-
static void release_z3fold_page_locked(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
@@ -940,10 +938,19 @@ lookup:
}
}
- if (zhdr && !zhdr->slots)
- zhdr->slots = alloc_slots(pool,
- can_sleep ? GFP_NOIO : GFP_ATOMIC);
+ if (zhdr && !zhdr->slots) {
+ zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
+ if (!zhdr->slots)
+ goto out_fail;
+ }
return zhdr;
+
+out_fail:
+ if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ add_to_unbuddied(pool, zhdr);
+ z3fold_page_unlock(zhdr);
+ }
+ return NULL;
}
/*
@@ -1066,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
- if (!size)
+ if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
if (size > PAGE_SIZE)
@@ -1093,28 +1100,7 @@ retry:
bud = FIRST;
}
- page = NULL;
- if (can_sleep) {
- spin_lock(&pool->stale_lock);
- zhdr = list_first_entry_or_null(&pool->stale,
- struct z3fold_header, buddy);
- /*
- * Before allocating a page, let's see if we can take one from
- * the stale pages list. cancel_work_sync() can sleep so we
- * limit this case to the contexts where we can sleep
- */
- if (zhdr) {
- list_del(&zhdr->buddy);
- spin_unlock(&pool->stale_lock);
- cancel_work_sync(&zhdr->work);
- page = virt_to_page(zhdr);
- } else {
- spin_unlock(&pool->stale_lock);
- }
- }
- if (!page)
- page = alloc_page(gfp);
-
+ page = alloc_page(gfp);
if (!page)
return -ENOMEM;
@@ -1134,10 +1120,9 @@ retry:
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
} else {
- if (trylock_page(page)) {
- __SetPageMovable(page, pool->inode->i_mapping);
- unlock_page(page);
- }
+ WARN_ON(!trylock_page(page));
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
}
z3fold_page_lock(zhdr);
@@ -1236,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
+ put_z3fold_header(zhdr);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1332,12 +1317,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
break;
}
- if (kref_get_unless_zero(&zhdr->refcount) == 0) {
- zhdr = NULL;
- break;
- }
if (!z3fold_page_trylock(zhdr)) {
- kref_put(&zhdr->refcount, release_z3fold_page);
zhdr = NULL;
continue; /* can't evict at this point */
}
@@ -1348,14 +1328,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
*/
if (zhdr->foreign_handles ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- if (!kref_put(&zhdr->refcount,
- release_z3fold_page_locked))
- z3fold_page_unlock(zhdr);
+ z3fold_page_unlock(zhdr);
zhdr = NULL;
continue; /* can't evict such page */
}
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
+ /* See comment in __z3fold_alloc. */
+ kref_get(&zhdr->refcount);
break;
}
@@ -1437,8 +1417,10 @@ next:
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
- z3fold_page_unlock(zhdr);
+ if (list_empty(&zhdr->buddy))
+ add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
}
/* We started off locked to we need to lock the pool back */
@@ -1590,8 +1572,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
if (!z3fold_page_trylock(zhdr))
return -EAGAIN;
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
- z3fold_page_unlock(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
return -EBUSY;
}
if (work_pending(&zhdr->work)) {
@@ -1601,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
new_zhdr = page_address(newpage);
memcpy(new_zhdr, zhdr, PAGE_SIZE);
newpage->private = page->private;
- page->private = 0;
+ set_bit(PAGE_MIGRATED, &page->private);
z3fold_page_unlock(zhdr);
spin_lock_init(&new_zhdr->page_lock);
INIT_WORK(&new_zhdr->work, compact_page_work);
@@ -1631,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
- clear_bit(PAGE_CLAIMED, &page->private);
+ /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
+ page->private = 0;
put_page(page);
return 0;
}
@@ -1653,6 +1636,8 @@ static void z3fold_page_putback(struct page *page)
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ if (list_empty(&zhdr->buddy))
+ add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}