Diffstat (limited to 'include/linux/mm.h')
-rw-r--r-- | include/linux/mm.h | 1713 |
1 file changed, 1133 insertions, 580 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index c54fb96cb1e6..8bbcccbc5565 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3,9 +3,6 @@ #define _LINUX_MM_H #include <linux/errno.h> - -#ifdef __KERNEL__ - #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/bug.h> @@ -15,6 +12,7 @@ #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> +#include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> @@ -23,22 +21,26 @@ #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> +#include <linux/page-flags.h> #include <linux/page_ref.h> -#include <linux/memremap.h> #include <linux/overflow.h> #include <linux/sizes.h> +#include <linux/sched.h> +#include <linux/pgtable.h> +#include <linux/kasan.h> +#include <linux/memremap.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; -struct file_ra_state; struct user_struct; -struct writeback_control; -struct bdi_writeback; +struct pt_regs; + +extern int sysctl_page_lock_unfairness; void init_mm_internals(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ +#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) @@ -91,7 +93,6 @@ extern int mmap_rnd_compat_bits __read_mostly; #endif #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/processor.h> /* @@ -99,7 +100,7 @@ extern int mmap_rnd_compat_bits __read_mostly; * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. - * It's defined as noop for arcitectures that don't support memory tagging. + * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) @@ -138,7 +139,7 @@ extern int mmap_rnd_compat_bits __read_mostly; /* This function must be updated when the size of struct page grows above 80 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can - * combine write statments if they are both assignments and can be reordered, + * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. 
*/ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) @@ -153,11 +154,14 @@ static inline void __mm_zero_struct_page(struct page *page) switch (sizeof(struct page)) { case 80: - _pp[9] = 0; /* fallthrough */ + _pp[9] = 0; + fallthrough; case 72: - _pp[8] = 0; /* fallthrough */ + _pp[8] = 0; + fallthrough; case 64: - _pp[7] = 0; /* fallthrough */ + _pp[7] = 0; + fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; @@ -200,20 +204,38 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) +#else +#define nth_page(page,n) ((page) + (n)) +#define folio_page_idx(folio, p) ((p) - &(folio)->page) +#endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) +/* to align the pointer to the (prev) page boundary */ +#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) + /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +static inline struct folio *lru_to_folio(struct list_head *head) +{ + return list_entry((head)->prev, struct folio, lru); +} + +void setup_initial_init_mm(void *start_code, void *end_code, + void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. @@ -255,7 +277,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 @@ -324,24 +345,49 @@ extern unsigned int kobjsize(const void *objp); #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI +#elif defined(CONFIG_ARM64) +# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. 
GP bit */ +# define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif -#if defined(CONFIG_X86_INTEL_MPX) -/* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_HIGH_ARCH_4 +#if defined(CONFIG_ARM64_MTE) +# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ #else -# define VM_MPX VM_NONE +# define VM_MTE VM_NONE +# define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif @@ -354,12 +400,18 @@ extern unsigned int kobjsize(const void *objp); #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + + /* * Special vmas that are non-mergable, non-mlock()able. - * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* This mask prevents VMA from being scanned with khugepaged */ +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) + /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE @@ -376,17 +428,33 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -extern pgprot_t protection_map[16]; -#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x20 /* Second try */ -#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ -#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ -#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +/* + * The default fault flags that should be used by most of the + * arch-specific page fault handlers. + */ +#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ + FAULT_FLAG_KILLABLE | \ + FAULT_FLAG_INTERRUPTIBLE) + +/** + * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * @flags: Fault flags. 
+ * + * This is mostly used for places where we want to try to avoid taking + * the mmap_lock for too long a time when waiting for another condition + * to change, in which case we can try to be polite to release the + * mmap_lock in the first round to avoid potential starvation of other + * processes that would also want the mmap_lock. + * + * Return: true if the page fault allows retry and this is the first + * attempt of the fault handling; false otherwise. + */ +static inline bool fault_flag_allow_retry_first(enum fault_flag flags) +{ + return (flags & FAULT_FLAG_ALLOW_RETRY) && + (!(flags & FAULT_FLAG_TRIED)); +} #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ @@ -397,10 +465,11 @@ extern pgprot_t protection_map[16]; { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ - { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" } + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } /* - * vm_fault is filled by the the pagefault handler and passed to the vma's + * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * @@ -410,20 +479,28 @@ extern pgprot_t protection_map[16]; * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - struct vm_area_struct *vma; /* Target VMA */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - gfp_t gfp_mask; /* gfp mask to be used for allocations */ - pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + const struct { + struct vm_area_struct *vma; /* Target VMA */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ + pgoff_t pgoff; /* Logical page offset based on vma */ + unsigned long address; /* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmasked */ + }; + enum fault_flag flags; /* FAULT_FLAG_xxx flags + * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ - pte_t orig_pte; /* Value of PTE at the time of fault */ + union { + pte_t orig_pte; /* Value of PTE at the time of fault */ + pmd_t orig_pmd; /* Value of PMD at the time of fault, + * used by PMD fault only. + */ + }; struct page *cow_page; /* Page handler may use for COW fault */ - struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by @@ -439,8 +516,8 @@ struct vm_fault { * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. - * vm_ops->map_pages() calls - * alloc_set_pte() from atomic context. + * vm_ops->map_pages() sets up a page + * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. @@ -461,13 +538,25 @@ enum page_entry_size { */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. 
+ */ void (*close)(struct vm_area_struct * area); - int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if eprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); - void (*map_pages)(struct vm_fault *vmf, + vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); @@ -479,7 +568,8 @@ struct vm_operations_struct { vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically - * for use by special VMAs that can switch between memory and hardware + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); @@ -504,7 +594,7 @@ struct vm_operations_struct { * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not - * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. @@ -541,6 +631,68 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + +static inline bool vma_is_foreign(struct vm_area_struct *vma) +{ + if (!current->mm) + return true; + + if (current->mm != vma->vm_mm) + return true; + + return false; +} + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_ACCESS_FLAGS; +} + +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses vma_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. 
+ */ + return vma_find(vmi, ULONG_MAX); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow @@ -559,11 +711,29 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; -/* - * FIXME: take this include out, include page-flags.h in - * files which need it (119 of them) +static inline unsigned int compound_order(struct page *page) +{ + if (!PageHead(page)) + return 0; + return page[1].compound_order; +} + +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. */ -#include <linux/page-flags.h> +static inline unsigned int folio_order(struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio->_folio_order; +} + #include <linux/huge_mm.h> /* @@ -588,13 +758,18 @@ static inline int put_page_testzero(struct page *page) return page_ref_dec_and_test(page); } +static inline int folio_put_testzero(struct folio *folio) +{ + return put_page_testzero(&folio->page); +} + /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ -static inline int get_page_unless_zero(struct page *page) +static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } @@ -639,42 +814,26 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} - -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return kvmalloc(bytes, flags); -} - -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +/* + * How many times the entire folio is mapped as a single unit (eg by a + * PMD or PUD entry). This is probably not what you want, except for + * debugging purposes; look at folio_mapcount() or page_mapcount() + * instead. + */ +static inline int folio_entire_mapcount(struct folio *folio) { - return kvmalloc_array(n, size, flags | __GFP_ZERO); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(folio_mapcount_ptr(folio)) + 1; } -extern void kvfree(const void *addr); - +/* + * Mapcount of compound page as a whole, does not include mapped sub-pages. + * + * Must be called only for compound pages. 
+ */ static inline int compound_mapcount(struct page *page) { - VM_BUG_ON_PAGE(!PageCompound(page), page); - page = compound_head(page); - return atomic_read(compound_mapcount_ptr(page)) + 1; + return folio_entire_mapcount(page_folio(page)); } /* @@ -689,30 +848,33 @@ static inline void page_mapcount_reset(struct page *page) int __page_mapcount(struct page *page); +/* + * Mapcount of 0-order page; when compound sub-page, includes + * compound_mapcount(). + * + * Result is undefined for pages which cannot be mapped into userspace. + * For example SLAB or special types of pages. See function page_has_type(). + * They use this place in struct page differently. + */ static inline int page_mapcount(struct page *page) { - VM_BUG_ON_PAGE(PageSlab(page), page); - if (unlikely(PageCompound(page))) return __page_mapcount(page); return atomic_read(&page->_mapcount) + 1; } +int folio_mapcount(struct folio *folio); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); -#else static inline int total_mapcount(struct page *page) { - return page_mapcount(page); + return folio_mapcount(page_folio(page)); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) + +#else +static inline int total_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif @@ -723,11 +885,21 @@ static inline struct page *virt_to_head_page(const void *x) return compound_head(page); } -void __put_page(struct page *page); +static inline struct folio *virt_to_folio(const void *x) +{ + struct page *page = virt_to_page(x); + + return page_folio(page); +} + +void __folio_put(struct folio *folio); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); +void folio_copy(struct folio *dst, struct folio *src); + +unsigned long nr_free_buffer_pages(void); /* * Compound pages have a destructor function. Provide a @@ -748,7 +920,7 @@ enum compound_dtor_id { #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[]; +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) @@ -757,28 +929,31 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) -{ - VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - return compound_page_dtors[page[1].compound_dtor]; -} +void destroy_large_folio(struct folio *folio); -static inline unsigned int compound_order(struct page *page) +static inline int head_compound_pincount(struct page *head) { - if (!PageHead(page)) - return 0; - return page[1].compound_order; + return atomic_read(compound_pincount_ptr(head)); } static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; +#ifdef CONFIG_64BIT + page[1].compound_nr = 1U << order; +#endif } /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { + if (!PageHead(page)) + return 1; +#ifdef CONFIG_64BIT + return page[1].compound_nr; +#else return 1UL << compound_order(page); +#endif } /* Returns the number of bytes in this potentially compound page. 
*/ @@ -793,6 +968,37 @@ static inline unsigned int page_shift(struct page *page) return PAGE_SHIFT + compound_order(page); } +/** + * thp_order - Order of a transparent huge page. + * @page: Head page of a transparent huge page. + */ +static inline unsigned int thp_order(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_order(page); +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_nr(page); +} + +/** + * thp_size - Size of a transparent huge page. + * @page: Head page of a transparent huge page. + * + * Return: Number of bytes in this page. + */ +static inline unsigned long thp_size(struct page *page) +{ + return PAGE_SIZE << thp_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU @@ -809,8 +1015,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); + vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif @@ -875,132 +1082,55 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); * back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - */ - -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) -#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) -#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) - -/* - * Define the bit shifts to access each section. For non-existent - * sections we define the shift as 0; that plus a 0 mask ensures - * the compiler will optimise away reference to them. - */ -#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) -#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) - -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ -#ifdef NODE_NOT_IN_PAGE_FLAGS -#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ - SECTIONS_PGOFF : ZONES_PGOFF) -#else -#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? 
\ - NODES_PGOFF : ZONES_PGOFF) -#endif - -#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) - -#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) -#define NODES_MASK ((1UL << NODES_WIDTH) - 1) -#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) -#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) -#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) - -static inline enum zone_type page_zonenum(const struct page *page) -{ - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_DEVICE; -} -extern void memmap_init_zone_device(struct zone *, unsigned long, - unsigned long, struct dev_pagemap *); -#else -static inline bool is_zone_device_page(const struct page *page) -{ - return false; -} -#endif - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page); +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool page_is_devmap_managed(struct page *page) +bool __put_devmap_managed_page_refs(struct page *page, int refs); +static inline bool put_devmap_managed_page_refs(struct page *page, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; - switch (page->pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_FS_DAX: - return true; - default: - break; - } - return false; + return __put_devmap_managed_page_refs(page, refs); } - -void put_devmap_managed_page(struct page *page); - -#else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool page_is_devmap_managed(struct page *page) +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ +static inline bool put_devmap_managed_page_refs(struct page *page, int refs) { return false; } +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline void put_devmap_managed_page(struct page *page) +static inline bool put_devmap_managed_page(struct page *page) { + return put_devmap_managed_page_refs(page, 1); } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool is_device_private_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} +/* 127: arbitrary random number, small enough to assemble well */ +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) folio_ref_count(folio) + 127u <= 127u) -static inline bool is_pci_p2pdma_page(const struct page *page) +/** + * folio_get - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * folio_try_get() may be the right interface for you to use. 
+ */ +static inline void folio_get(struct folio *folio) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + folio_ref_inc(folio); } -/* 127: arbitrary random number, small enough to assemble well */ -#define page_ref_zero_or_close_to_overflow(page) \ - ((unsigned int) page_ref_count(page) + 127u <= 127u) - static inline void get_page(struct page *page) { - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_refcount. - */ - VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); - page_ref_inc(page); + folio_get(page_folio(page)); } +bool __must_check try_grab_page(struct page *page, unsigned int flags); + static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); @@ -1010,48 +1140,121 @@ static inline __must_check bool try_get_page(struct page *page) return true; } +/** + * folio_put - Decrement the reference count on a folio. + * @folio: The folio. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put(struct folio *folio) +{ + if (folio_put_testzero(folio)) + __folio_put(folio); +} + +/** + * folio_put_refs - Reduce the reference count on a folio. + * @folio: The folio. + * @refs: The amount to subtract from the folio's reference count. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put_refs() unless you can be sure that these weren't + * the last references. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put_refs(struct folio *folio, int refs) +{ + if (folio_ref_sub_and_test(folio, refs)) + __folio_put(folio); +} + +void release_pages(struct page **pages, int nr); + +/** + * folios_put - Decrement the reference count on an array of folios. + * @folios: The folios. + * @nr: How many folios there are. + * + * Like folio_put(), but for an array of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which + * need to be taken if the folios are freed. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folios_put(struct folio **folios, unsigned int nr) +{ + release_pages((struct page **)folios, nr); +} + static inline void put_page(struct page *page) { - page = compound_head(page); + struct folio *folio = page_folio(page); /* - * For devmap managed pages we need to catch refcount transition from - * 2 to 1, when refcount reach one it means the page is free and we - * need to inform the device driver through callback. See - * include/linux/memremap.h and HMM for details. 
+ * For some devmap managed pages we need to catch refcount transition + * from 2 to 1: */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (put_devmap_managed_page(&folio->page)) return; - } - - if (put_page_testzero(page)) - __put_page(page); + folio_put(folio); } -/** - * unpin_user_page() - release a gup-pinned page - * @page: pointer to page to be released +/* + * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload + * the page's refcount so that two separate items are tracked: the original page + * reference count, and also a new count of how many pin_user_pages() calls were + * made against the page. ("gup-pinned" is another term for the latter). + * + * With this scheme, pin_user_pages() becomes special: such pages are marked as + * distinct from normal pages. As such, the unpin_user_page() call (and its + * variants) must be used in order to release gup-pinned pages. + * + * Choice of value: * - * Pages that were pinned via pin_user_pages*() must be released via either - * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so - * that eventually such pages can be separately tracked and uniquely handled. In - * particular, interactions with RDMA and filesystems need special handling. + * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference + * counts with respect to pin_user_pages() and unpin_user_page() becomes + * simpler, due to the fact that adding an even power of two to the page + * refcount has the effect of using only the upper N bits, for the code that + * counts up using the bias value. This means that the lower bits are left for + * the exclusive use of the original code that increments and decrements by one + * (or at least, by much smaller values than the bias value). * - * unpin_user_page() and put_page() are not interchangeable, despite this early - * implementation that makes them look the same. unpin_user_page() calls must - * be perfectly matched up with pin*() calls. + * Of course, once the lower bits overflow into the upper bits (and this is + * OK, because subtraction recovers the original values), then visual inspection + * no longer suffices to directly view the separate counts. However, for normal + * applications that don't have huge page reference counts, this won't be an + * issue. + * + * Locking: the lockless algorithm described in folio_try_get_rcu() + * provides safe operation for get_user_pages(), page_mkclean() and + * other calls that race to set up page table entries. 
*/ -static inline void unpin_user_page(struct page *page) -{ - put_page(page); -} +#define GUP_PIN_COUNTING_BIAS (1U << 10) +void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); - +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1080,7 +1283,24 @@ static inline int page_to_nid(const struct page *page) } #endif +static inline int folio_nid(const struct folio *folio) +{ + return page_to_nid(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); @@ -1144,12 +1364,25 @@ static inline void page_cpupid_reset_last(struct page *page) page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ + +static inline int xchg_page_access_time(struct page *page, int time) +{ + int last_time; + + last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return page_to_nid(page); /* XXX */ } +static inline int xchg_page_access_time(struct page *page, int time) +{ + return 0; +} + static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ @@ -1177,7 +1410,7 @@ static inline int cpu_pid_to_cpupid(int nid, int pid) static inline bool cpupid_pid_unset(int cpupid) { - return 1; + return true; } static inline void page_cpupid_reset_last(struct page *page) @@ -1190,23 +1423,50 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +/* + * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid + * setting tags for all pages to native kernel tag value 0xff, as the default + * value 0x00 maps to 0xff. 
+ */ + static inline u8 page_kasan_tag(const struct page *page) { - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + u8 tag = 0xff; + + if (kasan_enabled()) { + tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag ^= 0xff; + } + + return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + unsigned long old_flags, flags; + + if (!kasan_enabled()) + return; + + tag ^= 0xff; + old_flags = READ_ONCE(page->flags); + do { + flags = old_flags; + flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { - page_kasan_tag_set(page, 0xff); + if (kasan_enabled()) + page_kasan_tag_set(page, 0xff); } -#else + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + static inline u8 page_kasan_tag(const struct page *page) { return 0xff; @@ -1214,7 +1474,8 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } -#endif + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { @@ -1226,6 +1487,16 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } +static inline struct zone *folio_zone(const struct folio *folio) +{ + return page_zone(&folio->page); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return page_pgdat(&folio->page); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { @@ -1239,6 +1510,127 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/** + * folio_pfn - Return the Page Frame Number of a folio. + * @folio: The folio. + * + * A folio may contain multiple pages. The pages have consecutive + * Page Frame Numbers. + * + * Return: The Page Frame Number of the first page in the folio. + */ +static inline unsigned long folio_pfn(struct folio *folio) +{ + return page_to_pfn(&folio->page); +} + +static inline struct folio *pfn_folio(unsigned long pfn) +{ + return page_folio(pfn_to_page(pfn)); +} + +static inline atomic_t *folio_pincount_ptr(struct folio *folio) +{ + return &folio_page(folio, 1)->compound_pincount; +} + +/** + * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. + * @folio: The folio. + * + * This function checks if a folio has been pinned via a call to + * a function in the pin_user_pages() family. + * + * For small folios, the return value is partially fuzzy: false is not fuzzy, + * because it means "definitely not pinned for DMA", but true means "probably + * pinned for DMA, but possibly a false positive due to having at least + * GUP_PIN_COUNTING_BIAS worth of normal folio references". + * + * False positives are OK, because: a) it's unlikely for a folio to + * get that many refcounts, and b) all the callers of this routine are + * expected to be able to deal gracefully with a false positive. + * + * For large folios, the result will be exactly correct. That's because + * we have more tracking data available: the compound_pincount is used + * instead of the GUP_PIN_COUNTING_BIAS scheme. 
+ * + * For more information, please see Documentation/core-api/pin_user_pages.rst. + * + * Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. + */ +static inline bool folio_maybe_dma_pinned(struct folio *folio) +{ + if (folio_test_large(folio)) + return atomic_read(folio_pincount_ptr(folio)) > 0; + + /* + * folio_ref_count() is signed. If that refcount overflows, then + * folio_ref_count() returns a negative value, and callers will avoid + * further incrementing the refcount. + * + * Here, for that overflow case, use the sign bit to count a little + * bit higher via unsigned math, and thus still get an accurate result. + */ + return ((unsigned int)folio_ref_count(folio)) >= + GUP_PIN_COUNTING_BIAS; +} + +static inline bool page_maybe_dma_pinned(struct page *page) +{ + return folio_maybe_dma_pinned(page_folio(page)); +} + +/* + * This should most likely only be called during fork() to see whether we + * should break the cow immediately for an anon page on the src mm. + * + * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. + */ +static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, + struct page *page) +{ + VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); + + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + return false; + + return page_maybe_dma_pinned(page); +} + +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +#ifdef CONFIG_MIGRATION +static inline bool is_longterm_pinnable_page(struct page *page) +{ +#ifdef CONFIG_CMA + int mt = get_pageblock_migratetype(page); + + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) + return false; +#endif + /* The zero page may always be pinned */ + if (is_zero_pfn(page_to_pfn(page))) + return true; + + /* Coherent device memory must always allow eviction. */ + if (is_device_coherent_page(page)) + return false; + + /* Otherwise, non-movable zone pages can be pinned. */ + return !is_zone_movable_page(page); +} +#else +static inline bool is_longterm_pinnable_page(struct page *page) +{ + return true; +} +#endif + +static inline bool folio_is_longterm_pinnable(struct folio *folio) +{ + return is_longterm_pinnable_page(&folio->page); +} + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -1261,25 +1653,92 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A positive power of two. + */ +static inline long folio_nr_pages(struct folio *folio) { - return page->mem_cgroup; + if (!folio_test_large(folio)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. + * + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. This is the folio equivalent to writing ``page++``. 
+ * + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. + */ +static inline struct folio *folio_next(struct folio *folio) { - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); + return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } -#else -static inline struct mem_cgroup *page_memcg(struct page *page) + +/** + * folio_shift - The size of the memory described by this folio. + * @folio: The folio. + * + * A folio represents a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. See also + * folio_size() and folio_order(). + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(struct folio *folio) { - return NULL; + return PAGE_SHIFT + folio_order(folio); } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. + */ +static inline size_t folio_size(struct folio *folio) { - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; + return PAGE_SIZE << folio_order(folio); +} + +#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +static inline int arch_make_page_accessible(struct page *page) +{ + return 0; +} +#endif + +#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE +static inline int arch_make_folio_accessible(struct folio *folio) +{ + int ret; + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + ret = arch_make_page_accessible(folio_page(folio, i)); + if (ret) + break; + } + + return ret; } #endif @@ -1321,21 +1780,12 @@ void page_address_init(void); #define page_address_init() do { } while(0) #endif -extern void *page_rmapping(struct page *page); -extern struct anon_vma *page_anon_vma(struct page *page); -extern struct address_space *page_mapping(struct page *page); - -extern struct address_space *__page_file_mapping(struct page *); - -static inline -struct address_space *page_file_mapping(struct page *page) +static inline void *folio_address(const struct folio *folio) { - if (unlikely(PageSwapCache(page))) - return __page_file_mapping(page); - - return page->mapping; + return page_address(&folio->page); } +extern void *page_rmapping(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* @@ -1350,21 +1800,21 @@ static inline pgoff_t page_index(struct page *page) } bool page_mapped(struct page *page); -struct address_space *page_mapping(struct page *page); -struct address_space *page_mapping_file(struct page *page); +bool folio_mapped(struct folio *folio); /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ -static inline bool page_is_pfmemalloc(struct page *page) +static inline bool page_is_pfmemalloc(const struct page *page) { /* - * Page index cannot be this large so this must be - * a pfmemalloc page. + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. 
*/ - return page->index == -1UL; + return (uintptr_t)page->lru.next & BIT(1); } /* @@ -1373,12 +1823,12 @@ static inline bool page_is_pfmemalloc(struct page *page) */ static inline void set_page_pfmemalloc(struct page *page) { - page->index = -1UL; + page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { - page->index = 0; + page->lru.next = NULL; } /* @@ -1387,6 +1837,8 @@ static inline void clear_page_pfmemalloc(struct page *page) extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in @@ -1394,24 +1846,19 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) +{ + __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); +} #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif -extern int user_shm_lock(size_t, struct user_struct *); -extern void user_shm_unlock(size_t, struct user_struct *); - -/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ -}; +extern int user_shm_lock(size_t, struct ucounts *); +extern void user_shm_unlock(size_t, struct ucounts *); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); @@ -1422,18 +1869,18 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, @@ -1445,14 +1892,13 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int truncate_inode_page(struct 
address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); -int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags); -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int flags, + struct pt_regs *regs); +extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, @@ -1461,14 +1907,14 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) + unsigned long address, unsigned int flags, + struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } -static inline int fixup_user_fault(struct task_struct *tsk, - struct mm_struct *mm, unsigned long address, +static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ @@ -1491,14 +1937,14 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); -long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); @@ -1508,10 +1954,10 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); @@ -1522,76 +1968,14 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); -/* Container for pinned pfns / pages */ -struct frame_vector { - unsigned int nr_allocated; /* Number of frames we have space for */ - unsigned int nr_frames; /* Number of frames stored in ptrs array */ - bool got_ref; /* Did we pin pages by getting page ref? 
*/ - bool is_pfns; /* Does array contain pages or pfns? */ - void *ptrs[0]; /* Array of pinned pfns / pages. Use - * pfns_vector_pages() or pfns_vector_pfns() - * for access */ -}; - -struct frame_vector *frame_vector_create(unsigned int nr_frames); -void frame_vector_destroy(struct frame_vector *vec); -int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, - unsigned int gup_flags, struct frame_vector *vec); -void put_vaddr_frames(struct frame_vector *vec); -int frame_vector_to_pages(struct frame_vector *vec); -void frame_vector_to_pfns(struct frame_vector *vec); - -static inline unsigned int frame_vector_count(struct frame_vector *vec) -{ - return vec->nr_frames; -} - -static inline struct page **frame_vector_pages(struct frame_vector *vec) -{ - if (vec->is_pfns) { - int err = frame_vector_to_pages(vec); - - if (err) - return ERR_PTR(err); - } - return (struct page **)(vec->ptrs); -} - -static inline unsigned long *frame_vector_pfns(struct frame_vector *vec) -{ - if (!vec->is_pfns) - frame_vector_to_pfns(vec); - return (unsigned long *)(vec->ptrs); -} - struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); -extern int try_to_release_page(struct page * page, gfp_t gfp_mask); -extern void do_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); - -void __set_page_dirty(struct page *, struct address_space *, int warn); -int __set_page_dirty_nobuffers(struct page *page); -int __set_page_dirty_no_writeback(struct page *page); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_cleaned(struct page *page, struct address_space *mapping, - struct bdi_writeback *wb); -int set_page_dirty(struct page *page); +bool folio_mark_dirty(struct folio *folio); +bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void __cancel_dirty_page(struct page *page); -static inline void cancel_dirty_page(struct page *page) -{ - /* Avoid atomic ops, locking, etc. when not actually needed. */ - if (PageDirty(page)) - __cancel_dirty_page(page); -} -int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); @@ -1599,18 +1983,48 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks); -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + +/* + * Flags used by change_protection(). For now we make it a bitmap so + * that we can pass in multiple flags just like parameters. However + * for now all the callers are only use one of the flags at the same + * time. + */ +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. 
+ */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) +/* Whether this protection change is for NUMA hints */ +#define MM_CP_PROT_NUMA (1UL << 1) +/* Whether this change is for write protecting */ +#define MM_CP_UFFD_WP (1UL << 2) /* do wp */ +#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ +#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ + MM_CP_UFFD_WP_RESOLVE) + +extern unsigned long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); -extern int mprotect_fixup(struct vm_area_struct *vma, + unsigned long cp_flags); +extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ @@ -1720,6 +2134,18 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif +#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL +static inline int pte_special(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte; +} +#endif + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { @@ -1841,11 +2267,6 @@ int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) -/* - * The following ifdef needed to get the 5level-fixup.h header to work. - * Remove it when 5level-fixup.h has been removed. - */ -#ifndef __ARCH_HAS_5LEVEL_HACK static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -1859,7 +2280,6 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? 
NULL : pud_offset(p4d, address); } -#endif /* !__ARCH_HAS_5LEVEL_HACK */ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { @@ -1943,7 +2363,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) if (!ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -1951,7 +2371,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } #define pte_offset_map_lock(mm, pmd, address, ptlp) \ @@ -1994,7 +2414,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(pmd_to_page(pmd)); } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool pmd_ptlock_init(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE page->pmd_huge_pte = NULL; @@ -2002,7 +2422,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) return ptlock_init(page); } -static inline void pgtable_pmd_page_dtor(struct page *page) +static inline void pmd_ptlock_free(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(page->pmd_huge_pte, page); @@ -2019,8 +2439,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } -static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } -static inline void pgtable_pmd_page_dtor(struct page *page) {} +static inline bool pmd_ptlock_init(struct page *page) { return true; } +static inline void pmd_ptlock_free(struct page *page) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -2033,6 +2453,22 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + if (!pmd_ptlock_init(page)) + return false; + __SetPageTable(page); + inc_lruvec_page_state(page, NR_PAGETABLE); + return true; +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ + pmd_ptlock_free(page); + __ClearPageTable(page); + dec_lruvec_page_state(page, NR_PAGETABLE); +} + /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be @@ -2053,9 +2489,6 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * zones_size); -extern void __init free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); /* @@ -2067,32 +2500,20 @@ extern void free_initmem(void); extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); -#ifdef CONFIG_HIGHMEM -/* - * Free a highmem page into the buddy system, adjusting totalhigh_pages - * and totalram_pages. - */ -extern void free_highmem_page(struct page *page); -#endif - extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(const char *str); +extern void mem_init_print_info(void); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. 
*/ -static inline void __free_reserved_page(struct page *page) +static inline void free_reserved_page(struct page *page) { ClearPageReserved(page); init_page_count(page); __free_page(page); -} - -static inline void free_reserved_page(struct page *page) -{ - __free_reserved_page(page); adjust_managed_page_count(page, 1); } +#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { @@ -2111,7 +2532,7 @@ static inline unsigned long free_initmem_default(int poison) extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, - poison, "unused kernel"); + poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) @@ -2125,34 +2546,23 @@ static inline unsigned long get_num_physpages(void) return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * Using memblock node mappings, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); - * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly - * sparse_memory_present_with_active_regions() calls memory_present() for - * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. + * memblock_add_node(base, size, nid, MEMBLOCK_NONE) + * free_area_init(max_zone_pfns); */ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); @@ -2160,36 +2570,32 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); -extern void sparse_memory_present_with_active_regions(int nid); - -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) +#ifndef CONFIG_NUMA +static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -/* there is a per-arch backend function. 
*/ -extern int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum memmap_context, struct vmem_altmap *); +extern void memmap_init_range(unsigned long, int, unsigned long, + unsigned long, unsigned long, enum meminit_context, + struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); +extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags, nodemask_t *nodemask); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +{ + __show_mem(flags, nodemask, MAX_NR_ZONES - 1); +} extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); @@ -2206,6 +2612,7 @@ extern void setup_per_cpu_pageset(void); extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; +extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; @@ -2258,21 +2665,22 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); + static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, @@ -2290,7 +2698,8 @@ static inline int check_data_rlimit(unsigned long rlim, extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); @@ -2309,6 +2718,7 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); +unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long 
get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); @@ -2317,22 +2727,13 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); + unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); -extern int do_madvise(unsigned long start, size_t len_in, int behavior); - -static inline unsigned long -do_mmap_pgoff(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate, - struct list_head *uf) -{ - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); -} +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, @@ -2364,26 +2765,7 @@ struct vm_unmapped_area_info { unsigned long align_offset; }; -extern unsigned long unmapped_area(struct vm_unmapped_area_info *info); -extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); - -/* - * Search for an unmapped address range. - * - * We are looking for a range that: - * - does not intersect with any VMA; - * - is contained within the [low_limit, high_limit) interval; - * - is at least the desired size. 
- * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) - */ -static inline unsigned long -vm_unmapped_area(struct vm_unmapped_area_info *info) -{ - if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) - return unmapped_area_topdown(info); - else - return unmapped_area(info); -} +extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ extern void truncate_inode_pages(struct address_space *, loff_t); @@ -2393,38 +2775,15 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); -extern void filemap_map_pages(struct vm_fault *vmf, +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -/* mm/page-writeback.c */ -int __must_check write_one_page(struct page *page); -void task_dirty_inc(struct task_struct *tsk); - -/* readahead.c */ -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -/* CONFIG_STACK_GROWSUP still needs to to grow downwards at some places */ +/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ extern int expand_downwards(struct vm_area_struct *vma, unsigned long address); #if VM_GROWSUP @@ -2438,15 +2797,24 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) -{ - struct vm_area_struct * vma = find_vma(mm,start_addr); +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. 
+ */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) @@ -2482,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; @@ -2510,6 +2878,8 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) } #endif +void vma_set_file(struct vm_area_struct *vma, struct file *file); + #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -2518,7 +2888,11 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, @@ -2548,6 +2922,15 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) @@ -2565,19 +2948,16 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in page */ -#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ +#define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each @@ -2632,7 +3012,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * releasing pages: 
get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * - * Please see Documentation/vm/pin_user_pages.rst for more information. + * Please see Documentation/core-api/pin_user_pages.rst for more information. */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) @@ -2646,6 +3026,65 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. + * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + */ +static inline bool gup_must_unshare(unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. + */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) + return false; + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. + */ + return !PageAnonExclusive(page); +} + +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. + */ +static inline bool gup_can_follow_protnone(unsigned int flags) +{ + /* + * FOLL_FORCE has to be able to make progress even if the VMA is + * inaccessible. Further, FOLL_FORCE access usually does not represent + * application behaviour and we should avoid triggering NUMA hinting + * faults. + */ + return flags & FOLL_FORCE; +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); @@ -2653,44 +3092,58 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING -extern bool page_poisoning_enabled(void); -extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern void __kernel_poison_pages(struct page *page, int numpages); +extern void __kernel_unpoison_pages(struct page *page, int numpages); +extern bool _page_poisoning_enabled_early; +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); +static inline bool page_poisoning_enabled(void) +{ + return _page_poisoning_enabled_early; +} +/* + * For use in fast paths after init_mem_debugging() has run, or when a + * false negative result is not harmful when called too early. 
+ */ +static inline bool page_poisoning_enabled_static(void) +{ + return static_branch_unlikely(&_page_poisoning_enabled); +} +static inline void kernel_poison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, numpages); +} +static inline void kernel_unpoison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_unpoison_pages(page, numpages); +} #else static inline bool page_poisoning_enabled(void) { return false; } -static inline void kernel_poison_pages(struct page *page, int numpages, - int enable) { } +static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } +static inline void kernel_poison_pages(struct page *page, int numpages) { } +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_alloc); -#else -DECLARE_STATIC_KEY_FALSE(init_on_alloc); -#endif +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { - if (static_branch_unlikely(&init_on_alloc) && - !page_poisoning_enabled()) + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) return true; return flags & __GFP_ZERO; } -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_free); -#else -DECLARE_STATIC_KEY_FALSE(init_on_free); -#endif +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { - return static_branch_unlikely(&init_on_free) && - !page_poisoning_enabled(); + return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free); } -#ifdef CONFIG_DEBUG_PAGEALLOC -extern void init_debug_pagealloc(void); -#else -static inline void init_debug_pagealloc(void) {} -#endif extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); @@ -2712,28 +3165,28 @@ static inline bool debug_pagealloc_enabled_static(void) return static_branch_unlikely(&_debug_pagealloc_enabled); } -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) -extern void __kernel_map_pages(struct page *page, int numpages, int enable); - +#ifdef CONFIG_DEBUG_PAGEALLOC /* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() + * To support DEBUG_PAGEALLOC architecture must ensure that + * __kernel_map_pages() never fails */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) +extern void __kernel_map_pages(struct page *page, int numpages, int enable); + +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { - __kernel_map_pages(page, numpages, enable); + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); } -#ifdef CONFIG_HIBERNATION -extern bool kernel_page_present(struct page *page); -#endif /* CONFIG_HIBERNATION */ -#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} -#ifdef CONFIG_HIBERNATION -static inline bool kernel_page_present(struct page *page) { return true; } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ + +static inline void debug_pagealloc_unmap_pages(struct page *page, int 
numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} +#else /* CONFIG_DEBUG_PAGEALLOC */ +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} +#endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); @@ -2755,12 +3208,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 @@ -2779,19 +3231,21 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap); + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); -pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); +pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, + struct vmem_altmap *altmap, struct page *reuse); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; -void *vmemmap_alloc_block_buf(unsigned long size, int node); -void *altmap_alloc_block_buf(unsigned long size, struct vmem_altmap *altmap); +void *vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int vmemmap_populate_basepages(unsigned long start, unsigned long end, - int node); + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); @@ -2807,18 +3261,43 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, + MF_SW_SIMULATED = 1 << 5, + MF_NO_RETRY = 1 << 6, }; +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); +extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); +extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); +#ifdef CONFIG_MEMORY_FAILURE +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +#else +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + return 0; +} +#endif + +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, 
int flags) +{ + return -ENXIO; +} +#endif +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif /* * Error handlers for various types of pages. @@ -2835,10 +3314,8 @@ enum mf_action_page_type { MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, - MF_MSG_NON_PMD_HUGE, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -2850,8 +3327,8 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; @@ -2867,6 +3344,23 @@ extern long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, unsigned int pages_per_huge_page, bool allow_pagefault); + +/** + * vma_is_special_huge - Are transhuge page-table entries considered special? + * @vma: Pointer to the struct vm_area_struct to consider + * + * Whether transhuge page-table entries are considered "special" following + * the definition in vm_normal_page(). + * + * Return: true if transhuge page-table entries should be considered special, + * false otherwise. + */ +static inline bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + return vma_is_dax(vma) || (vma->vm_file && + (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #ifdef CONFIG_DEBUG_PAGEALLOC @@ -2921,5 +3415,64 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif -#endif /* __KERNEL__ */ +extern int sysctl_nr_trim_pages; + +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + +/** + * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * @seals: the seals to check + * @vma: the vma to operate on + * + * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on + * the vma flags. Return 0 if check pass, or <0 for errors. + */ +static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) +{ + if (seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (vma->vm_flags & VM_SHARED) + vma->vm_flags &= ~(VM_MAYWRITE); + } + + return 0; +} + +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, + struct anon_vma_name *anon_name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) { + return 0; +} +#endif + +/* + * Whether to drop the pte markers, for example, the uffd-wp information for + * file-backed memory. This should only be specified when we will completely + * drop the page in the mm, either by truncation or unmapping of the vma. By + * default, the flag is not set. + */ +#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) + #endif /* _LINUX_MM_H */ |