aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/include/linux/mm.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--include/linux/mm.h1244
1 files changed, 613 insertions, 631 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c0910bc3e4a..0ef2ba0c667a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -5,12 +5,14 @@
#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
+#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
+#include <linux/compiler.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
@@ -30,6 +32,8 @@
#include <linux/kasan.h>
#include <linux/memremap.h>
#include <linux/slab.h>
+#include <linux/cacheinfo.h>
+#include <linux/rcuwait.h>
struct mempolicy;
struct anon_vma;
@@ -38,22 +42,10 @@ struct user_struct;
struct pt_regs;
struct folio_batch;
-extern int sysctl_page_lock_unfairness;
-
+void arch_mm_preinit(void);
void mm_core_init(void);
void init_mm_internals(void);
-#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
-extern unsigned long max_mapnr;
-
-static inline void set_max_mapnr(unsigned long limit)
-{
- max_mapnr = limit;
-}
-#else
-static inline void set_max_mapnr(unsigned long limit) { }
-#endif
-
extern atomic_long_t _totalram_pages;
static inline unsigned long totalram_pages(void)
{
@@ -76,8 +68,6 @@ static inline void totalram_pages_add(long count)
}
extern void * high_memory;
-extern int page_cluster;
-extern const int page_cluster_max;
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
@@ -87,7 +77,7 @@ extern int sysctl_legacy_va_layout;
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
-extern const int mmap_rnd_bits_max;
+extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
@@ -96,6 +86,14 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
+#ifndef DIRECT_MAP_PHYSMEM_END
+# ifdef MAX_PHYSMEM_BITS
+# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+# else
+# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63))
+# endif
+#endif
+
#include <asm/page.h>
#include <asm/processor.h>
@@ -199,17 +197,6 @@ extern int sysctl_max_map_count;
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
-extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
-extern unsigned long sysctl_overcommit_kbytes;
-
-int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
- loff_t *);
-int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
- loff_t *);
-int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
- loff_t *);
-
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio))
@@ -247,8 +234,6 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
-/* Use only if VMA has no other users */
-void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
@@ -320,21 +305,27 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
+#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
-# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */
-# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
-# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
-#ifdef CONFIG_PPC
+# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_BIT0 VM_HIGH_ARCH_0
+# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
+# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
+#if CONFIG_ARCH_PKEY_BITS > 3
+# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
+#else
+# define VM_PKEY_BIT3 0
+#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
# define VM_PKEY_BIT4 VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4 0
@@ -352,13 +343,21 @@ extern unsigned int kobjsize(const void *objp);
* for more details on the guard size.
*/
# define VM_SHADOW_STACK VM_HIGH_ARCH_5
-#else
+#endif
+
+#if defined(CONFIG_ARM64_GCS)
+/*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+# define VM_SHADOW_STACK VM_HIGH_ARCH_6
+#endif
+
+#ifndef VM_SHADOW_STACK
# define VM_SHADOW_STACK VM_NONE
#endif
-#if defined(CONFIG_X86)
-# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC)
+#if defined(CONFIG_PPC64)
# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP VM_ARCH_1
@@ -373,8 +372,8 @@ extern unsigned int kobjsize(const void *objp);
#endif
#if defined(CONFIG_ARM64_MTE)
-# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */
+# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */
+# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */
#else
# define VM_MTE VM_NONE
# define VM_MTE_ALLOWED VM_NONE
@@ -385,7 +384,7 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 38
+# define VM_UFFD_MINOR_BIT 41
# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR VM_NONE
@@ -405,6 +404,20 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ALLOW_ANY_UNCACHED VM_NONE
#endif
+#ifdef CONFIG_64BIT
+#define VM_DROPPABLE_BIT 40
+#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
+#elif defined(CONFIG_PPC32)
+#define VM_DROPPABLE VM_ARCH_1
+#else
+#define VM_DROPPABLE VM_NONE
+#endif
+
+#ifdef CONFIG_64BIT
+/* VM is sealed, in vm_flags */
+#define VM_SEALED _BITUL(63)
+#endif
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
@@ -656,109 +669,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_PER_VMA_LOCK
-/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- */
-static inline bool vma_start_read(struct vm_area_struct *vma)
-{
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
- return false;
-
- if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
- return false;
-
- /*
- * Overflow might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock->lock);
- return false;
- }
- return true;
-}
-
-static inline void vma_end_read(struct vm_area_struct *vma)
-{
- rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->vm_lock->lock);
- rcu_read_unlock();
-}
-
-/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
-{
- mmap_assert_write_locked(vma->vm_mm);
-
- /*
- * current task is holding mmap_write_lock, both vma->vm_lock_seq and
- * mm->mm_lock_seq can't be concurrently modified.
- */
- *mm_lock_seq = vma->vm_mm->mm_lock_seq;
- return (vma->vm_lock_seq == *mm_lock_seq);
-}
-
/*
- * Begin writing to a VMA.
- * Exclude concurrent readers under the per-VMA lock until the currently
- * write-locked mmap_lock is dropped or downgraded.
+ * These must be here rather than mmap_lock.h as dependent on vm_fault type,
+ * declared in this header.
*/
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
- int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
- return;
-
- down_write(&vma->vm_lock->lock);
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock->lock);
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
- int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
- if (!rwsem_is_locked(&vma->vm_lock->lock))
- vma_assert_write_locked(vma);
-}
-
-static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
-{
- /* When detaching vma should be write-locked */
- if (detached)
- vma_assert_write_locked(vma);
- vma->detached = detached;
-}
-
+#ifdef CONFIG_PER_VMA_LOCK
static inline void release_fault_lock(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
@@ -774,32 +689,7 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
else
mmap_assert_locked(vmf->vma->vm_mm);
}
-
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address);
-
-#else /* CONFIG_PER_VMA_LOCK */
-
-static inline bool vma_start_read(struct vm_area_struct *vma)
- { return false; }
-static inline void vma_end_read(struct vm_area_struct *vma) {}
-static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
- { mmap_assert_write_locked(vma->vm_mm); }
-static inline void vma_mark_detached(struct vm_area_struct *vma,
- bool detached) {}
-
-static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- return NULL;
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
- mmap_assert_locked(vma->vm_mm);
-}
-
+#else
static inline void release_fault_lock(struct vm_fault *vmf)
{
mmap_read_unlock(vmf->vma->vm_mm);
@@ -809,23 +699,17 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
{
mmap_assert_locked(vmf->vma->vm_mm);
}
-
#endif /* CONFIG_PER_VMA_LOCK */
extern const struct vm_operations_struct vma_dummy_vm_ops;
-/*
- * WARNING: vma_init does not initialize vma->vm_lock.
- * Use vm_area_alloc()/vm_area_free() if vma needs locking.
- */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma_mark_detached(vma, false);
- vma_numab_state_init(vma);
+ vma_lock_init(vma, false);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
@@ -992,27 +876,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
return mas_prev(&vmi->mas, 0);
}
-static inline
-struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
-{
- return mas_prev_range(&vmi->mas, 0);
-}
-
-static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
-{
- return vmi->mas.index;
-}
-
-static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
-{
- return vmi->mas.last + 1;
-}
-static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
- unsigned long count)
-{
- return mas_expected_entries(&vmi->mas, count);
-}
-
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
unsigned long start, unsigned long end, gfp_t gfp)
{
@@ -1039,6 +902,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
+ vma_mark_attached(vma);
return 0;
}
@@ -1079,6 +943,25 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
struct mmu_gather;
struct inode;
+extern void prep_compound_page(struct page *page, unsigned int order);
+
+static inline unsigned int folio_large_order(const struct folio *folio)
+{
+ return folio->_flags_1 & 0xff;
+}
+
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+static inline long folio_large_nr_pages(const struct folio *folio)
+{
+ return folio->_nr_pages;
+}
+#else
+static inline long folio_large_nr_pages(const struct folio *folio)
+{
+ return 1L << folio_large_order(folio);
+}
+#endif
+
/*
* compound_order() can be called without holding a reference, which means
* that niceties like page_folio() don't work. These callers should be
@@ -1092,7 +975,7 @@ static inline unsigned int compound_order(struct page *page)
if (!test_bit(PG_head, &folio->flags))
return 0;
- return folio->_flags_1 & 0xff;
+ return folio_large_order(folio);
}
/**
@@ -1104,11 +987,28 @@ static inline unsigned int compound_order(struct page *page)
*
* Return: The order of the folio.
*/
-static inline unsigned int folio_order(struct folio *folio)
+static inline unsigned int folio_order(const struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
- return folio->_flags_1 & 0xff;
+ return folio_large_order(folio);
+}
+
+/**
+ * folio_reset_order - Reset the folio order and derived _nr_pages
+ * @folio: The folio.
+ *
+ * Reset the order and derived _nr_pages to 0. Must only be used in the
+ * process of splitting large folios.
+ */
+static inline void folio_reset_order(struct folio *folio)
+{
+ if (WARN_ON_ONCE(!folio_test_large(folio)))
+ return;
+ folio->_flags_1 &= ~0xffUL;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
}
#include <linux/huge_mm.h>
@@ -1196,75 +1096,53 @@ static inline int is_vmalloc_or_module_addr(const void *x)
/*
* How many times the entire folio is mapped as a single unit (eg by a
* PMD or PUD entry). This is probably not what you want, except for
- * debugging purposes - it does not include PTE-mapped sub-pages; look
- * at folio_mapcount() or page_mapcount() instead.
+ * debugging purposes or implementation of other core folio_*() primitives.
*/
-static inline int folio_entire_mapcount(struct folio *folio)
+static inline int folio_entire_mapcount(const struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1))
+ return 0;
return atomic_read(&folio->_entire_mapcount) + 1;
}
-/*
- * The atomic page->_mapcount, starts from -1: so that transitions
- * both from it and to it can be tracked, using atomic_inc_and_test
- * and atomic_add_negative(-1).
- */
-static inline void page_mapcount_reset(struct page *page)
+static inline int folio_large_mapcount(const struct folio *folio)
{
- atomic_set(&(page)->_mapcount, -1);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
+ return atomic_read(&folio->_large_mapcount) + 1;
}
/**
- * page_mapcount() - Number of times this precise page is mapped.
- * @page: The page.
+ * folio_mapcount() - Number of mappings of this folio.
+ * @folio: The folio.
*
- * The number of times this page is mapped. If this page is part of
- * a large folio, it includes the number of times this page is mapped
- * as part of that folio.
+ * The folio mapcount corresponds to the number of present user page table
+ * entries that reference any part of a folio. Each such present user page
+ * table entry must be paired with exactly on folio reference.
*
- * The result is undefined for pages which cannot be mapped into userspace.
- * For example SLAB or special types of pages. See function page_has_type().
- * They use this field in struct page differently.
- */
-static inline int page_mapcount(struct page *page)
-{
- int mapcount = atomic_read(&page->_mapcount) + 1;
-
- if (unlikely(PageCompound(page)))
- mapcount += folio_entire_mapcount(page_folio(page));
-
- return mapcount;
-}
-
-int folio_total_mapcount(struct folio *folio);
-
-/**
- * folio_mapcount() - Calculate the number of mappings of this folio.
- * @folio: The folio.
+ * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts
+ * exactly once.
*
- * A large folio tracks both how many times the entire folio is mapped,
- * and how many times each individual page in the folio is mapped.
- * This function calculates the total number of times the folio is
- * mapped.
+ * For hugetlb folios, each abstracted "hugetlb" user page table entry that
+ * references the entire folio counts exactly once, even when such special
+ * page table entries are comprised of multiple ordinary page table entries.
+ *
+ * Will report 0 for pages which cannot be mapped into userspace, such as
+ * slab, page tables and similar.
*
* Return: The number of times this folio is mapped.
*/
-static inline int folio_mapcount(struct folio *folio)
+static inline int folio_mapcount(const struct folio *folio)
{
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) + 1;
- return folio_total_mapcount(folio);
-}
+ int mapcount;
-static inline bool folio_large_is_mapped(struct folio *folio)
-{
- /*
- * Reading _entire_mapcount below could be omitted if hugetlb
- * participated in incrementing nr_pages_mapped when compound mapped.
- */
- return atomic_read(&folio->_nr_pages_mapped) > 0 ||
- atomic_read(&folio->_entire_mapcount) >= 0;
+ if (likely(!folio_test_large(folio))) {
+ mapcount = atomic_read(&folio->_mapcount) + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+ return mapcount;
+ }
+ return folio_large_mapcount(folio);
}
/**
@@ -1273,11 +1151,9 @@ static inline bool folio_large_is_mapped(struct folio *folio)
*
* Return: True if any page in this folio is referenced by user page tables.
*/
-static inline bool folio_mapped(struct folio *folio)
+static inline bool folio_mapped(const struct folio *folio)
{
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) >= 0;
- return folio_large_is_mapped(folio);
+ return folio_mapcount(folio) >= 1;
}
/*
@@ -1285,11 +1161,9 @@ static inline bool folio_mapped(struct folio *folio)
* For compound page it returns true if any sub-page of compound page is mapped,
* even if this particular sub-page is not itself mapped by any PTE or PMD.
*/
-static inline bool page_mapped(struct page *page)
+static inline bool page_mapped(const struct page *page)
{
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- return folio_large_is_mapped(page_folio(page));
+ return folio_mapped(page_folio(page));
}
static inline struct page *virt_to_head_page(const void *x)
@@ -1308,15 +1182,12 @@ static inline struct folio *virt_to_folio(const void *x)
void __folio_put(struct folio *folio);
-void put_pages_list(struct list_head *pages);
-
void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
+int folio_mc_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
-void destroy_large_folio(struct folio *folio);
-
/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
@@ -1364,7 +1235,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr);
@@ -1405,9 +1276,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
* the page's disk buffers. PG_private must be set to tell the VM to call
* into the filesystem to release these pages.
*
- * A page may belong to an inode's memory mapping. In this case, page->mapping
- * is the pointer to the inode, and page->index is the file offset of the page,
- * in units of PAGE_SIZE.
+ * A folio may belong to an inode's memory mapping. In this case,
+ * folio->mapping points to the inode, and folio->index is the file
+ * offset of the folio, in units of PAGE_SIZE.
*
* If pagecache pages are not associated with an inode, they are said to be
* anonymous pages. These may become associated with the swapcache, and in that
@@ -1431,30 +1302,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
* back into memory.
*/
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
-DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
-{
- if (!static_branch_unlikely(&devmap_managed_key))
- return false;
- if (!is_zone_device_page(page))
- return false;
- return __put_devmap_managed_page_refs(page, refs);
-}
-#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
-{
- return false;
-}
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-
-static inline bool put_devmap_managed_page(struct page *page)
-{
- return put_devmap_managed_page_refs(page, 1);
-}
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1475,7 +1322,10 @@ static inline void folio_get(struct folio *folio)
static inline void get_page(struct page *page)
{
- folio_get(page_folio(page));
+ struct folio *folio = page_folio(page);
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return;
+ folio_get(folio);
}
static inline __must_check bool try_get_page(struct page *page)
@@ -1569,12 +1419,9 @@ static inline void put_page(struct page *page)
{
struct folio *folio = page_folio(page);
- /*
- * For some devmap managed pages we need to catch refcount transition
- * from 2 to 1:
- */
- if (put_devmap_managed_page(&folio->page))
+ if (folio_test_slab(folio))
return;
+
folio_put(folio);
}
@@ -1605,17 +1452,20 @@ static inline void put_page(struct page *page)
* issue.
*
* Locking: the lockless algorithm described in folio_try_get_rcu()
- * provides safe operation for get_user_pages(), page_mkclean() and
+ * provides safe operation for get_user_pages(), folio_mkclean() and
* other calls that race to set up page table entries.
*/
#define GUP_PIN_COUNTING_BIAS (1U << 10)
void unpin_user_page(struct page *page);
+void unpin_folio(struct folio *folio);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
+void unpin_user_folio(struct folio *folio, unsigned long npages);
+void unpin_folios(struct folio **folios, unsigned long nfolios);
static inline bool is_cow_mapping(vm_flags_t flags)
{
@@ -1763,6 +1613,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
}
}
+
+bool folio_use_access_time(struct folio *folio);
#else /* !CONFIG_NUMA_BALANCING */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
@@ -1816,6 +1668,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
+static inline bool folio_use_access_time(struct folio *folio)
+{
+ return false;
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -1914,7 +1770,7 @@ static inline unsigned long page_to_section(const struct page *page)
*
* Return: The Page Frame Number of the first page in the folio.
*/
-static inline unsigned long folio_pfn(struct folio *folio)
+static inline unsigned long folio_pfn(const struct folio *folio)
{
return page_to_pfn(&folio->page);
}
@@ -1924,6 +1780,52 @@ static inline struct folio *pfn_folio(unsigned long pfn)
return page_folio(pfn_to_page(pfn));
}
+#ifdef CONFIG_MMU
+static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pte(page_to_pfn(page), pgprot);
+}
+
+/**
+ * folio_mk_pte - Create a PTE for this folio
+ * @folio: The folio to create a PTE for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_ptes().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
+{
+ return pfn_pte(folio_pfn(folio), pgprot);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * folio_mk_pmd - Create a PMD for this folio
+ * @folio: The folio to create a PMD for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_pmd_at().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
+{
+ return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
+}
+#endif
+#endif /* CONFIG_MMU */
+
+static inline bool folio_has_pincount(const struct folio *folio)
+{
+ if (IS_ENABLED(CONFIG_64BIT))
+ return folio_test_large(folio);
+ return folio_order(folio) > 1;
+}
+
/**
* folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
* @folio: The folio.
@@ -1940,18 +1842,18 @@ static inline struct folio *pfn_folio(unsigned long pfn)
* get that many refcounts, and b) all the callers of this routine are
* expected to be able to deal gracefully with a false positive.
*
- * For large folios, the result will be exactly correct. That's because
+ * For most large folios, the result will be exactly correct. That's because
* we have more tracking data available: the _pincount field is used
* instead of the GUP_PIN_COUNTING_BIAS scheme.
*
* For more information, please see Documentation/core-api/pin_user_pages.rst.
*
- * Return: True, if it is likely that the page has been "dma-pinned".
- * False, if the page is definitely not dma-pinned.
+ * Return: True, if it is likely that the folio has been "dma-pinned".
+ * False, if the folio is definitely not dma-pinned.
*/
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
return atomic_read(&folio->_pincount) > 0;
/*
@@ -1966,11 +1868,6 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio)
GUP_PIN_COUNTING_BIAS;
}
-static inline bool page_maybe_dma_pinned(struct page *page)
-{
- return folio_maybe_dma_pinned(page_folio(page));
-}
-
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for an anon page on the src mm.
@@ -2028,6 +1925,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
if (folio_is_device_coherent(folio))
return false;
+ /*
+ * Filesystems can only tolerate transient delays to truncate and
+ * hole-punch operations
+ */
+ if (folio_is_fsdax(folio))
+ return false;
+
/* Otherwise, non-movable zone folios can be pinned. */
return !folio_is_zone_movable(folio);
@@ -2067,15 +1971,11 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*
* Return: A positive power of two.
*/
-static inline long folio_nr_pages(struct folio *folio)
+static inline long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
-#ifdef CONFIG_64BIT
- return folio->_folio_nr_pages;
-#else
- return 1L << (folio->_flags_1 & 0xff);
-#endif
+ return folio_large_nr_pages(folio);
}
/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
@@ -2090,26 +1990,13 @@ static inline long folio_nr_pages(struct folio *folio)
* page. compound_nr() can be called on a tail page, and is defined to
* return 1 in that case.
*/
-static inline unsigned long compound_nr(struct page *page)
+static inline long compound_nr(struct page *page)
{
struct folio *folio = (struct folio *)page;
if (!test_bit(PG_head, &folio->flags))
return 1;
-#ifdef CONFIG_64BIT
- return folio->_folio_nr_pages;
-#else
- return 1L << (folio->_flags_1 & 0xff);
-#endif
-}
-
-/**
- * thp_nr_pages - The number of regular pages in this huge page.
- * @page: The head page of a huge page.
- */
-static inline int thp_nr_pages(struct page *page)
-{
- return folio_nr_pages((struct folio *)page);
+ return folio_large_nr_pages(folio);
}
/**
@@ -2143,7 +2030,7 @@ static inline struct folio *folio_next(struct folio *folio)
* it from being split. It is not necessary for the folio to be locked.
* Return: The base-2 logarithm of the size of this folio.
*/
-static inline unsigned int folio_shift(struct folio *folio)
+static inline unsigned int folio_shift(const struct folio *folio)
{
return PAGE_SHIFT + folio_order(folio);
}
@@ -2156,49 +2043,134 @@ static inline unsigned int folio_shift(struct folio *folio)
* it from being split. It is not necessary for the folio to be locked.
* Return: The number of bytes in this folio.
*/
-static inline size_t folio_size(struct folio *folio)
+static inline size_t folio_size(const struct folio *folio)
{
return PAGE_SIZE << folio_order(folio);
}
/**
- * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * folio_maybe_mapped_shared - Whether the folio is mapped into the page
+ * tables of more than one MM
* @folio: The folio.
*
- * folio_estimated_sharers() aims to serve as a function to efficiently
- * estimate the number of processes sharing a folio. This is done by
- * looking at the precise mapcount of the first subpage in the folio, and
- * assuming the other subpages are the same. This may not be true for large
- * folios. If you want exact mapcounts for exact calculations, look at
- * page_mapcount() or folio_total_mapcount().
+ * This function checks if the folio maybe currently mapped into more than one
+ * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
+ * MM ("mapped exclusively").
+ *
+ * For KSM folios, this function also returns "mapped shared" when a folio is
+ * mapped multiple times into the same MM, because the individual page mappings
+ * are independent.
+ *
+ * For small anonymous folios and anonymous hugetlb folios, the return
+ * value will be exactly correct: non-KSM folios can only be mapped at most once
+ * into an MM, and they cannot be partially mapped. KSM folios are
+ * considered shared even if mapped multiple times into the same MM.
+ *
+ * For other folios, the result can be fuzzy:
+ * #. For partially-mappable large folios (THP), the return value can wrongly
+ * indicate "mapped shared" (false positive) if a folio was mapped by
+ * more than two MMs at one point in time.
+ * #. For pagecache folios (including hugetlb), the return value can wrongly
+ * indicate "mapped shared" (false positive) when two VMAs in the same MM
+ * cover the same file range.
*
- * Return: The estimated number of processes sharing a folio.
+ * Further, this function only considers current page table mappings that
+ * are tracked using the folio mapcount(s).
+ *
+ * This function does not consider:
+ * #. If the folio might get mapped in the (near) future (e.g., swapcache,
+ * pagecache, temporary unmapping for migration).
+ * #. If the folio is mapped differently (VM_PFNMAP).
+ * #. If hugetlb page table sharing applies. Callers might want to check
+ * hugetlb_pmd_shared().
+ *
+ * Return: Whether the folio is estimated to be mapped into more than one MM.
*/
-static inline int folio_estimated_sharers(struct folio *folio)
+static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
- return page_mapcount(folio_page(folio, 0));
+ int mapcount = folio_mapcount(folio);
+
+ /* Only partially-mappable folios require more care. */
+ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
+ return mapcount > 1;
+
+ /*
+ * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
+ * simply assume "mapped shared", nobody should really care
+ * about this for arbitrary kernel allocations.
+ */
+ if (!IS_ENABLED(CONFIG_MM_ID))
+ return true;
+
+ /*
+ * A single mapping implies "mapped exclusively", even if the
+ * folio flag says something different: it's easier to handle this
+ * case here instead of on the RMAP hot path.
+ */
+ if (mapcount <= 1)
+ return false;
+ return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
}
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
+/**
+ * folio_expected_ref_count - calculate the expected folio refcount
+ * @folio: the folio
+ *
+ * Calculate the expected folio refcount, taking references from the pagecache,
+ * swapcache, PG_private and page table mappings into account. Useful in
+ * combination with folio_ref_count() to detect unexpected references (e.g.,
+ * GUP or other temporary references).
+ *
+ * Does currently not consider references from the LRU cache. If the folio
+ * was isolated from the LRU (which is the case during migration or split),
+ * the LRU cache does not apply.
+ *
+ * Calling this function on an unmapped folio -- !folio_mapped() -- that is
+ * locked will return a stable result.
+ *
+ * Calling this function on a mapped folio will not result in a stable result,
+ * because nothing stops additional page table mappings from coming (e.g.,
+ * fork()) or going (e.g., munmap()).
+ *
+ * Calling this function without the folio lock will also not result in a
+ * stable result: for example, the folio might get dropped from the swapcache
+ * concurrently.
+ *
+ * However, even when called without the folio lock or on a mapped folio,
+ * this function can be used to detect unexpected references early (for example,
+ * if it makes sense to even lock the folio and unmap it).
+ *
+ * The caller must add any reference (e.g., from folio_try_get()) it might be
+ * holding itself to the result.
+ *
+ * Returns the expected folio refcount.
+ */
+static inline int folio_expected_ref_count(const struct folio *folio)
{
- return 0;
+ const int order = folio_order(folio);
+ int ref_count = 0;
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return 0;
+
+ if (folio_test_anon(folio)) {
+ /* One reference per page from the swapcache. */
+ ref_count += folio_test_swapcache(folio) << order;
+ } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
+ /* One reference per page from the pagecache. */
+ ref_count += !!folio->mapping << order;
+ /* One reference from PG_private. */
+ ref_count += folio_test_private(folio);
+ }
+
+ /* One reference per page table mapping. */
+ return ref_count + folio_mapcount(folio);
}
-#endif
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
static inline int arch_make_folio_accessible(struct folio *folio)
{
- int ret;
- long i, nr = folio_nr_pages(folio);
-
- for (i = 0; i < nr; i++) {
- ret = arch_make_page_accessible(folio_page(folio, i));
- if (ret)
- break;
- }
-
- return ret;
+ return 0;
}
#endif
@@ -2207,11 +2179,6 @@ static inline int arch_make_folio_accessible(struct folio *folio)
*/
#include <linux/vmstat.h>
-static __always_inline void *lowmem_page_address(const struct page *page)
-{
- return page_to_virt(page);
-}
-
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif
@@ -2234,6 +2201,11 @@ void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif
+static __always_inline void *lowmem_page_address(const struct page *page)
+{
+ return page_to_virt(page);
+}
+
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address) do { } while(0)
@@ -2245,19 +2217,6 @@ static inline void *folio_address(const struct folio *folio)
return page_address(&folio->page);
}
-extern pgoff_t __page_file_index(struct page *page);
-
-/*
- * Return the pagecache index of the passed page. Regular pagecache pages
- * use ->index whereas swapcache pages use swp_offset(->private)
- */
-static inline pgoff_t page_index(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return __page_file_index(page);
- return page->index;
-}
-
/*
* Return true only if the page has been allocated with
* ALLOC_NO_WATERMARKS and the low watermark was not
@@ -2308,7 +2267,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
extern void pagefault_out_of_memory(void);
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
-#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
/*
@@ -2317,6 +2275,7 @@ extern void pagefault_out_of_memory(void);
struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
+ bool reclaim_pt; /* Need reclaim page tables? */
zap_flags_t zap_flags; /* Extra flags for zapping */
};
@@ -2391,15 +2350,42 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp);
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
- unsigned long *pfn);
-int follow_phys(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
+struct follow_pfnmap_args {
+ /**
+ * Inputs:
+ * @vma: Pointer to @vm_area_struct struct
+ * @address: the virtual address to walk
+ */
+ struct vm_area_struct *vma;
+ unsigned long address;
+ /**
+ * Internals:
+ *
+ * The caller shouldn't touch any of these.
+ */
+ spinlock_t *lock;
+ pte_t *ptep;
+ /**
+ * Outputs:
+ *
+ * @pfn: the PFN of the address
+ * @addr_mask: address mask covering pfn
+ * @pgprot: the pgprot_t of the mapping
+ * @writable: whether the mapping is writable
+ * @special: whether the mapping is a special mapping (real PFN maps)
+ */
+ unsigned long pfn;
+ unsigned long addr_mask;
+ pgprot_t pgprot;
+ bool writable;
+ bool special;
+};
+int follow_pfnmap_start(struct follow_pfnmap_args *args);
+void follow_pfnmap_end(struct follow_pfnmap_args *args);
+
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
@@ -2457,6 +2443,11 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
+#ifdef CONFIG_BPF_SYSCALL
+extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags);
+#endif
+
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -2504,6 +2495,10 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
+long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
+ struct folio **folios, unsigned int max_folios,
+ pgoff_t *offset);
+int folio_add_pins(struct folio *folio, unsigned int pins);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
@@ -2516,19 +2511,15 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
struct kvec;
-struct page *get_dump_page(unsigned long addr);
+struct page *get_dump_page(unsigned long addr, int *locked);
bool folio_mark_dirty(struct folio *folio);
+bool folio_mark_dirty_lock(struct folio *folio);
bool set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
-extern unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack);
-
/*
* Flags used by change_protection(). For now we make it a bitmap so
* that we can pass in multiple flags just like parameters. However
@@ -2549,21 +2540,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
-bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
-{
- /*
- * We want to check manually if we can change individual PTEs writable
- * if we can't do that automatically for all PTEs in a mapping. For
- * private mappings, that's always the case when we have write
- * permissions as we properly have to handle COW.
- */
- if (vma->vm_flags & VM_SHARED)
- return vma_wants_writenotify(vma, vma->vm_page_prot);
- return !!(vma->vm_flags & VM_WRITE);
-
-}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
@@ -2651,7 +2627,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm)
{
unsigned long _rss = get_mm_rss(mm);
- if ((mm)->hiwater_rss < _rss)
+ if (data_race(mm->hiwater_rss) < _rss)
(mm)->hiwater_rss = _rss;
}
@@ -2687,6 +2663,30 @@ static inline pte_t pte_mkspecial(pte_t pte)
}
#endif
+#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+ return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+ return pmd;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+ return false;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+ return pud;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
@@ -2857,12 +2857,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
*
* Return: The ptdesc describing the allocated page tables.
*/
-static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
- struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+ struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
return page_ptdesc(page);
}
+#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
/**
* pagetable_free - Free pagetables
@@ -2878,7 +2879,7 @@ static inline void pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
}
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
@@ -2913,6 +2914,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}
+static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
+{
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
+ BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
+ return ptlock_ptr(virt_to_ptdesc(pte));
+}
+
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
/*
@@ -2929,7 +2937,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
return true;
}
-#else /* !USE_SPLIT_PTE_PTLOCKS */
+#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
@@ -2937,23 +2945,24 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}
+static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
+{
+ return &mm->page_table_lock;
+}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
-static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
struct folio *folio = ptdesc_folio(ptdesc);
- if (!ptlock_init(ptdesc))
- return false;
__folio_set_pgtable(folio);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
- return true;
}
-static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
struct folio *folio = ptdesc_folio(ptdesc);
@@ -2962,7 +2971,30 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
-pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
+static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
+{
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
+static inline bool pagetable_pte_ctor(struct mm_struct *mm,
+ struct ptdesc *ptdesc)
+{
+ if (mm != &init_mm && !ptlock_init(ptdesc))
+ return false;
+ __pagetable_ctor(ptdesc);
+ return true;
+}
+
+pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
+static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
+ pmd_t *pmdvalp)
+{
+ pte_t *pte;
+
+ __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
+ return pte;
+}
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
return __pte_offset_map(pmd, addr, NULL);
@@ -2975,12 +3007,16 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
- __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp));
+ __cond_lock(RCU, __cond_lock(*ptlp,
+ pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
return pte;
}
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, pmd_t *pmdvalp,
+ spinlock_t **ptlp);
#define pte_unmap_unlock(pte, ptl) do { \
spin_unlock(ptl); \
@@ -3000,7 +3036,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
-#if USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_SPLIT_PMD_PTLOCKS)
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
@@ -3026,14 +3062,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
return ptlock_init(ptdesc);
}
-static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
-#endif
- ptlock_free(ptdesc);
-}
-
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
#else
@@ -3044,7 +3072,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
}
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -3057,26 +3084,16 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
-static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
+static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
+ struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- if (!pmd_ptlock_init(ptdesc))
+ if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
return false;
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ ptdesc_pmd_pts_init(ptdesc);
+ __pagetable_ctor(ptdesc);
return true;
}
-static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
-{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- pmd_ptlock_free(ptdesc);
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
-}
-
/*
* No scalability reason to split PUD locks yet, but follow the same pattern
* as the PMD locks to make it easier if we decide to. The VM should not be
@@ -3098,18 +3115,17 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __pagetable_ctor(ptdesc);
}
-static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ __pagetable_ctor(ptdesc);
+}
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
+{
+ __pagetable_ctor(ptdesc);
}
extern void __init pagecache_init(void);
@@ -3130,14 +3146,7 @@ extern void reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid);
/* Free the reserved page into the buddy system, so it gets managed. */
-static inline void free_reserved_page(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- adjust_managed_page_count(page, 1);
-}
-#define free_highmem_page(page) free_reserved_page(page)
+void free_reserved_page(struct page *page);
static inline void mark_page_reserved(struct page *page)
{
@@ -3193,8 +3202,6 @@ static inline unsigned long get_num_physpages(void)
*/
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
-unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
@@ -3210,7 +3217,6 @@ static inline int early_pfn_to_nid(unsigned long pfn)
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
-extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void mem_init(void);
extern void __init mmap_init(void);
@@ -3222,9 +3228,6 @@ static inline void show_mem(void)
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
-#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-extern unsigned long arch_reserved_kernel_pages(void);
-#endif
extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
@@ -3271,78 +3274,10 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next);
-extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void unlink_file_vma(struct vm_area_struct *);
-extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
- unsigned long addr, unsigned long len, pgoff_t pgoff,
- bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
-struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long vm_flags,
- struct mempolicy *policy,
- struct vm_userfaultfd_ctx uffd_ctx,
- struct anon_vma_name *anon_name);
-
-/* We are about to modify the VMA's flags. */
-static inline struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long new_flags)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
-}
-
-/* We are about to modify the VMA's flags and/or anon_name. */
-static inline struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long new_flags,
- struct anon_vma_name *new_name)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
-}
-
-/* We are about to modify the VMA's memory policy. */
-static inline struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct mempolicy *new_pol)
-{
- return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
- new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-}
-
-/* We are about to modify the VMA's flags and/or uffd context. */
-static inline struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), new_ctx, anon_vma_name(vma));
-}
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, bool write);
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3375,19 +3310,21 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags,
const struct vm_special_mapping *spec);
-/* This is an obsolete alternative to _install_special_mapping. */
-extern int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long flags, struct page **pages);
unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+unsigned long
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);
+
+static inline unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
+}
-extern unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
@@ -3395,14 +3332,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
+int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
-extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf, bool unlock);
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
@@ -3429,6 +3366,7 @@ struct vm_unmapped_area_info {
unsigned long high_limit;
unsigned long align_mask;
unsigned long align_offset;
+ unsigned long start_gap;
};
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
@@ -3450,9 +3388,6 @@ extern unsigned long stack_guard_gap;
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
-
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
@@ -3570,6 +3505,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
@@ -3629,9 +3566,6 @@ static inline vm_fault_t vmf_fs_error(int err)
return VM_FAULT_SIGBUS;
}
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int foll_flags);
-
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
if (vm_fault & VM_FAULT_OOM)
@@ -3785,24 +3719,22 @@ static inline bool page_is_guard(struct page *page)
return PageGuard(page);
}
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return false;
- return __set_page_guard(zone, page, order, migratetype);
+ return __set_page_guard(zone, page, order);
}
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return;
- __clear_page_guard(zone, page, order, migratetype);
+ __clear_page_guard(zone, page, order);
}
#else /* CONFIG_DEBUG_PAGEALLOC */
@@ -3812,9 +3744,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
+ unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+ unsigned int order) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
@@ -3835,12 +3767,6 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
-#ifdef CONFIG_SYSCTL
-extern int sysctl_drop_caches;
-int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
- loff_t *);
-#endif
-
void drop_slab(void);
#ifndef CONFIG_MMU
@@ -3859,17 +3785,17 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
#endif
void *sparse_buffer_alloc(unsigned long size);
+unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
-void pmd_init(void *addr);
-void pud_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap, struct page *reuse);
+ struct vmem_altmap *altmap, unsigned long ptpfn,
+ unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
@@ -3885,6 +3811,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
+int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
+int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
+void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
@@ -3951,9 +3883,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
}
#endif
-void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
- unsigned long nr_pages);
-
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1,
@@ -3969,7 +3898,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
-extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
@@ -3982,7 +3910,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
-struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
#else
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
@@ -4003,12 +3930,6 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
}
#endif
-#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma, struct list_head *to_kill,
- unsigned long ksm_addr);
-#endif
-
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -4049,10 +3970,10 @@ enum mf_result {
enum mf_action_page_type {
MF_MSG_KERNEL,
MF_MSG_KERNEL_HIGH_ORDER,
- MF_MSG_SLAB,
MF_MSG_DIFFERENT_COMPOUND,
MF_MSG_HUGE,
MF_MSG_FREE_HUGE,
+ MF_MSG_GET_HWPOISON,
MF_MSG_UNMAP_FAILED,
MF_MSG_DIRTY_SWAPCACHE,
MF_MSG_CLEAN_SWAPCACHE,
@@ -4066,13 +3987,12 @@ enum mf_action_page_type {
MF_MSG_BUDDY,
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
+ MF_MSG_ALREADY_POISONED,
MF_MSG_UNKNOWN,
};
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-extern void clear_huge_page(struct page *page,
- unsigned long addr_hint,
- unsigned int pages_per_huge_page);
+void folio_zero_user(struct folio *folio, unsigned long addr_hint);
int copy_user_large_folio(struct folio *dst, struct folio *src,
unsigned long addr_hint,
struct vm_area_struct *vma);
@@ -4123,47 +4043,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr);
#endif
-extern int sysctl_nr_trim_pages;
-
-#ifdef CONFIG_PRINTK
-void mem_dump_obj(void *object);
-#else
-static inline void mem_dump_obj(void *object) {}
-#endif
-
-/**
- * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
- * handle them.
- * @seals: the seals to check
- * @vma: the vma to operate on
- *
- * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
- * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
- */
-static inline int seal_check_write(int seals, struct vm_area_struct *vma)
-{
- if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * write seals are active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- /*
- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vm_flags_clear(vma, VM_MAYWRITE);
- }
-
- return 0;
-}
-
#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
unsigned long len_in,
@@ -4178,18 +4057,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
#ifdef CONFIG_UNACCEPTED_MEMORY
-bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
-void accept_memory(phys_addr_t start, phys_addr_t end);
+bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
+void accept_memory(phys_addr_t start, unsigned long size);
#else
static inline bool range_contains_unaccepted_memory(phys_addr_t start,
- phys_addr_t end)
+ unsigned long size)
{
return false;
}
-static inline void accept_memory(phys_addr_t start, phys_addr_t end)
+static inline void accept_memory(phys_addr_t start, unsigned long size)
{
}
@@ -4197,9 +4076,112 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
{
- phys_addr_t paddr = pfn << PAGE_SHIFT;
+ return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
+}
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma);
+void vma_pgtable_walk_end(struct vm_area_struct *vma);
+
+int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
+int reserve_mem_release_by_name(const char *name);
+
+#ifdef CONFIG_64BIT
+int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
+#else
+static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
+{
+ /* noop on 32 bit */
+ return 0;
+}
+#endif
- return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+/*
+ * user_alloc_needs_zeroing checks if a user folio from page allocator needs to
+ * be zeroed or not.
+ */
+static inline bool user_alloc_needs_zeroing(void)
+{
+ /*
+ * for user folios, arch with cache aliasing requires cache flush and
+ * arc changes folio->flags to make icache coherent with dcache, so
+ * always return false to make caller use
+ * clear_user_page()/clear_user_highpage().
+ */
+ return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() ||
+ !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+ &init_on_alloc);
}
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
+int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+
+
+/*
+ * mseal of userspace process's system mappings.
+ */
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP VM_SEALED
+#else
+#define VM_SEALED_SYSMAP VM_NONE
+#endif
+
+/*
+ * DMA mapping IDs for page_pool
+ *
+ * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
+ * stashes it in the upper bits of page->pp_magic. We always want to be able to
+ * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
+ * pages can have arbitrary kernel pointers stored in the same field as pp_magic
+ * (since it overlaps with page->lru.next), so we must ensure that we cannot
+ * mistake a valid kernel pointer with any of the values we write into this
+ * field.
+ *
+ * On architectures that set POISON_POINTER_DELTA, this is already ensured,
+ * since this value becomes part of PP_SIGNATURE; meaning we can just use the
+ * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
+ * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
+ * 0, we make sure that we leave the two topmost bits empty, as that guarantees
+ * we won't mistake a valid kernel pointer for a value we set, regardless of the
+ * VMSPLIT setting.
+ *
+ * Altogether, this means that the number of bits available is constrained by
+ * the size of an unsigned long (at the upper end, subtracting two bits per the
+ * above), and the definition of PP_SIGNATURE (with or without
+ * POISON_POINTER_DELTA).
+ */
+#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
+#if POISON_POINTER_DELTA > 0
+/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
+ * index to not overlap with that if set
+ */
+#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
+#else
+/* Always leave out the topmost two; see above. */
+#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2)
+#endif
+
+#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
+ PP_DMA_INDEX_SHIFT)
+
+/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
+ * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
+ * the head page of compound page and bit 1 for pfmemalloc page, as well as the
+ * bits used for the DMA index. page_is_pfmemalloc() is checked in
+ * __page_pool_put_page() to avoid recycling the pfmemalloc page.
+ */
+#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
+
+#ifdef CONFIG_PAGE_POOL
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+ return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+#else
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+ return false;
+}
+#endif
+
#endif /* _LINUX_MM_H */