Diffstat (limited to 'include/linux/mm.h')
-rw-r--r-- | include/linux/mm.h | 3150 |
1 file changed, 2090 insertions, 1060 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..0ef2ba0c667a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3,17 +3,16 @@ #define _LINUX_MM_H #include <linux/errno.h> - -#ifdef __KERNEL__ - #include <linux/mmdebug.h> #include <linux/gfp.h> +#include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> +#include <linux/compiler.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> @@ -26,36 +25,27 @@ #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> -#include <linux/memremap.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> +#include <linux/kasan.h> +#include <linux/memremap.h> +#include <linux/slab.h> +#include <linux/cacheinfo.h> +#include <linux/rcuwait.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; -struct file_ra_state; struct user_struct; -struct writeback_control; -struct bdi_writeback; struct pt_regs; +struct folio_batch; -extern int sysctl_page_lock_unfairness; - +void arch_mm_preinit(void); +void mm_core_init(void); void init_mm_internals(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; - -static inline void set_max_mapnr(unsigned long limit) -{ - max_mapnr = limit; -} -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif - extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { @@ -78,7 +68,6 @@ static inline void totalram_pages_add(long count) } extern void * high_memory; -extern int page_cluster; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; @@ -88,7 +77,7 @@ extern int sysctl_legacy_va_layout; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; -extern const int mmap_rnd_bits_max; +extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS @@ -97,20 +86,17 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif +#ifndef DIRECT_MAP_PHYSMEM_END +# ifdef MAX_PHYSMEM_BITS +# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# else +# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) +# endif +#endif + #include <asm/page.h> #include <asm/processor.h> -/* - * Architectures that support memory tagging (assigning tags to memory regions, - * embedding these tags into addresses that point to these memory regions, and - * checking that the memory and the pointer tags match on memory accesses) - * redefine this macro to strip tags from pointers. - * It's defined as noop for arcitectures that don't support memory tagging. - */ -#ifndef untagged_addr -#define untagged_addr(addr) (addr) -#endif - #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif @@ -141,10 +127,10 @@ extern int mmap_rnd_compat_bits __read_mostly; * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 -/* This function must be updated when the size of struct page grows above 80 +/* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. 
Also the compiler can - * combine write statments if they are both assignments and can be reordered, + * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) @@ -152,12 +138,18 @@ static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; - /* Check that struct page is either 56, 64, 72, or 80 bytes */ + /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); - BUILD_BUG_ON(sizeof(struct page) > 80); + BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { + case 96: + _pp[11] = 0; + fallthrough; + case 88: + _pp[10] = 0; + fallthrough; case 80: _pp[9] = 0; fallthrough; @@ -205,26 +197,30 @@ extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern unsigned long sysctl_overcommit_kbytes; - -int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); -int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); -int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); - +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) +#else +#define nth_page(page,n) ((page) + (n)) +#define folio_page_idx(folio, p) ((p) - &(folio)->page) +#endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) +/* to align the pointer to the (prev) page boundary */ +#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) + /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +static inline struct folio *lru_to_folio(struct list_head *head) +{ + return list_entry((head)->prev, struct folio, lru); +} + +void setup_initial_init_mm(void *start_code, void *end_code, + void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. @@ -264,9 +260,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ +#else /* CONFIG_MMU */ +#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ +#define VM_UFFD_MISSING 0 +#endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. 
*/ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 @@ -304,34 +304,63 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 -#ifdef CONFIG_PPC +# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 +# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 +# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 +# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#if CONFIG_ARCH_PKEY_BITS > 3 +# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#else +# define VM_PKEY_BIT3 0 +#endif +#if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ -#if defined(CONFIG_X86) -# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ -#elif defined(CONFIG_PPC) +#ifdef CONFIG_X86_USER_SHADOW_STACK +/* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace protect + * itself from attacks. A single page is enough for current shadow stack archs + * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c + * for more details on the guard size. + */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_5 +#endif + +#if defined(CONFIG_ARM64_GCS) +/* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. 
+ */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#endif + +#ifndef VM_SHADOW_STACK +# define VM_SHADOW_STACK VM_NONE +#endif + +#if defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 -#elif defined(CONFIG_IA64) -# define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI @@ -343,8 +372,8 @@ extern unsigned int kobjsize(const void *objp); #endif #if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ +# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE @@ -354,8 +383,43 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 41 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + +/* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED_BIT 39 +#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#endif + +#ifdef CONFIG_64BIT +#define VM_DROPPABLE_BIT 40 +#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) +#elif defined(CONFIG_PPC32) +#define VM_DROPPABLE VM_ARCH_1 +#else +#define VM_DROPPABLE VM_NONE +#endif + +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + /* Bits set in the VMA until the stack is in its final location */ -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) @@ -375,10 +439,14 @@ extern unsigned int kobjsize(const void *objp); #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP +#define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN +#define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) @@ -398,8 +466,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR @@ -411,51 +479,6 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. 
*/ -extern pgprot_t protection_map[16]; - -/** - * Fault flag definitions. - * - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ -#define FAULT_FLAG_WRITE 0x01 -#define FAULT_FLAG_MKWRITE 0x02 -#define FAULT_FLAG_ALLOW_RETRY 0x04 -#define FAULT_FLAG_RETRY_NOWAIT 0x08 -#define FAULT_FLAG_KILLABLE 0x10 -#define FAULT_FLAG_TRIED 0x20 -#define FAULT_FLAG_USER 0x40 -#define FAULT_FLAG_REMOTE 0x80 -#define FAULT_FLAG_INSTRUCTION 0x100 -#define FAULT_FLAG_INTERRUPTIBLE 0x200 /* * The default fault flags that should be used by most of the @@ -467,6 +490,7 @@ extern pgprot_t protection_map[16]; /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition @@ -477,7 +501,7 @@ extern pgprot_t protection_map[16]; * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ -static inline bool fault_flag_allow_retry_first(unsigned int flags) +static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); @@ -493,7 +517,8 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ + { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's @@ -506,17 +531,26 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) * pgoff should be used in favour of virtual_address, if possible. 
*/ struct vm_fault { - struct vm_area_struct *vma; /* Target VMA */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - gfp_t gfp_mask; /* gfp mask to be used for allocations */ - pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + const struct { + struct vm_area_struct *vma; /* Target VMA */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ + pgoff_t pgoff; /* Logical page offset based on vma */ + unsigned long address; /* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmasked */ + }; + enum fault_flag flags; /* FAULT_FLAG_xxx flags + * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ - pte_t orig_pte; /* Value of PTE at the time of fault */ + union { + pte_t orig_pte; /* Value of PTE at the time of fault */ + pmd_t orig_pmd; /* Value of PMD at the time of fault, + * used by PMD fault only. + */ + }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a @@ -534,21 +568,14 @@ struct vm_fault { * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. - * vm_ops->map_pages() calls - * alloc_set_pte() from atomic context. + * vm_ops->map_pages() sets up a page + * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; -/* page entry size for vm->huge_fault() */ -enum page_entry_size { - PE_SIZE_PTE = 0, - PE_SIZE_PMD, - PE_SIZE_PUD, -}; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -556,13 +583,24 @@ enum page_entry_size { */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ void (*close)(struct vm_area_struct * area); - int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); - vm_fault_t (*huge_fault)(struct vm_fault *vmf, - enum page_entry_size pe_size); - void (*map_pages)(struct vm_fault *vmf, + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); @@ -574,7 +612,8 @@ struct vm_operations_struct { vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically - * for use by special VMAs that can switch between memory and hardware + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. 
*/ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); @@ -605,7 +644,7 @@ struct vm_operations_struct { * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the @@ -616,14 +655,122 @@ struct vm_operations_struct { unsigned long addr); }; -static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +#ifdef CONFIG_NUMA_BALANCING +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ + vma->numab_state = NULL; +} +static inline void vma_numab_state_free(struct vm_area_struct *vma) { - static const struct vm_operations_struct dummy_vm_ops = {}; + kfree(vma->numab_state); +} +#else +static inline void vma_numab_state_init(struct vm_area_struct *vma) {} +static inline void vma_numab_state_free(struct vm_area_struct *vma) {} +#endif /* CONFIG_NUMA_BALANCING */ + +/* + * These must be here rather than mmap_lock.h as dependent on vm_fault type, + * declared in this header. + */ +#ifdef CONFIG_PER_VMA_LOCK +static inline void release_fault_lock(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_end_read(vmf->vma); + else + mmap_read_unlock(vmf->vma->vm_mm); +} + +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_assert_locked(vmf->vma); + else + mmap_assert_locked(vmf->vma->vm_mm); +} +#else +static inline void release_fault_lock(struct vm_fault *vmf) +{ + mmap_read_unlock(vmf->vma->vm_mm); +} + +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + mmap_assert_locked(vmf->vma->vm_mm); +} +#endif /* CONFIG_PER_VMA_LOCK */ +extern const struct vm_operations_struct vma_dummy_vm_ops; + +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; - vma->vm_ops = &dummy_vm_ops; + vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_lock_init(vma, false); +} + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. 
+ */ +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vma_start_write(vma); + __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) @@ -636,6 +783,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +/* + * Indicate if the VMA is a heap for the given task; for + * /proc/PID/maps that is the heap of the main task. + */ +static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) +{ + return vma->vm_start < vma->vm_mm->brk && + vma->vm_end > vma->vm_mm->start_brk; +} + +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) +{ + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; +} + static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -666,14 +838,101 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max - 1); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. 
+ */ + return mas_find(&vmi->mas, ULONG_MAX); +} + +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + vma_mark_attached(vma); + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); +bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); @@ -684,6 +943,74 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +extern void prep_compound_page(struct page *page, unsigned int order); + +static inline unsigned int folio_large_order(const struct folio *folio) +{ + return folio->_flags_1 & 0xff; +} + +#ifdef NR_PAGES_IN_LARGE_FOLIO +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return folio->_nr_pages; +} +#else +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return 1L << folio_large_order(folio); +} +#endif + +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work. These callers should be + * prepared to handle wild return values. For example, PG_head may be + * set before the order is initialised, or this may be a tail page. + * See compaction.c for some good examples. + */ +static inline unsigned int compound_order(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) + return 0; + return folio_large_order(folio); +} + +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(const struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio_large_order(folio); +} + +/** + * folio_reset_order - Reset the folio order and derived _nr_pages + * @folio: The folio. 
+ * + * Reset the order and derived _nr_pages to 0. Must only be used in the + * process of splitting large folios. + */ +static inline void folio_reset_order(struct folio *folio) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + folio->_flags_1 &= ~0xffUL; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 0; +#endif +} + #include <linux/huge_mm.h> /* @@ -708,17 +1035,29 @@ static inline int put_page_testzero(struct page *page) return page_ref_dec_and_test(page); } +static inline int folio_put_testzero(struct folio *folio) +{ + return put_page_testzero(&folio->page); +} + /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ -static inline int get_page_unless_zero(struct page *page) +static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} + extern int page_is_ram(unsigned long pfn); enum { @@ -740,11 +1079,6 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ - -#ifndef is_ioremap_addr -#define is_ioremap_addr(x) is_vmalloc_addr(x) -#endif - #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); @@ -759,99 +1093,78 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} - -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return kvmalloc(bytes, flags); -} - -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +/* + * How many times the entire folio is mapped as a single unit (eg by a + * PMD or PUD entry). This is probably not what you want, except for + * debugging purposes or implementation of other core folio_*() primitives. + */ +static inline int folio_entire_mapcount(const struct folio *folio) { - return kvmalloc_array(n, size, flags | __GFP_ZERO); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) + return 0; + return atomic_read(&folio->_entire_mapcount) + 1; } -extern void kvfree(const void *addr); -extern void kvfree_sensitive(const void *addr, size_t len); - -static inline int head_compound_mapcount(struct page *head) +static inline int folio_large_mapcount(const struct folio *folio) { - return atomic_read(compound_mapcount_ptr(head)) + 1; + VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(&folio->_large_mapcount) + 1; } -/* - * Mapcount of compound page as a whole, does not include mapped sub-pages. +/** + * folio_mapcount() - Number of mappings of this folio. + * @folio: The folio. 
+ * + * The folio mapcount corresponds to the number of present user page table + * entries that reference any part of a folio. Each such present user page + * table entry must be paired with exactly on folio reference. * - * Must be called only for compound pages or any their tail sub-pages. + * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts + * exactly once. + * + * For hugetlb folios, each abstracted "hugetlb" user page table entry that + * references the entire folio counts exactly once, even when such special + * page table entries are comprised of multiple ordinary page table entries. + * + * Will report 0 for pages which cannot be mapped into userspace, such as + * slab, page tables and similar. + * + * Return: The number of times this folio is mapped. */ -static inline int compound_mapcount(struct page *page) +static inline int folio_mapcount(const struct folio *folio) { - VM_BUG_ON_PAGE(!PageCompound(page), page); - page = compound_head(page); - return head_compound_mapcount(page); -} + int mapcount; -/* - * The atomic page->_mapcount, starts from -1: so that transitions - * both from it and to it can be tracked, using atomic_inc_and_test - * and atomic_add_negative(-1). - */ -static inline void page_mapcount_reset(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); + if (likely(!folio_test_large(folio))) { + mapcount = atomic_read(&folio->_mapcount) + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + return mapcount; + } + return folio_large_mapcount(folio); } -int __page_mapcount(struct page *page); - -/* - * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount(). +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. * - * Result is undefined for pages which cannot be mapped into userspace. - * For example SLAB or special types of pages. See function page_has_type(). - * They use this place in struct page differently. + * Return: True if any page in this folio is referenced by user page tables. */ -static inline int page_mapcount(struct page *page) +static inline bool folio_mapped(const struct folio *folio) { - if (unlikely(PageCompound(page))) - return __page_mapcount(page); - return atomic_read(&page->_mapcount) + 1; + return folio_mapcount(folio) >= 1; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); -#else -static inline int total_mapcount(struct page *page) -{ - return page_mapcount(page); -} -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any sub-page of compound page is mapped, + * even if this particular sub-page is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(const struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return folio_mapped(page_folio(page)); } -#endif static inline struct page *virt_to_head_page(const void *x) { @@ -860,89 +1173,20 @@ static inline struct page *virt_to_head_page(const void *x) return compound_head(page); } -void __put_page(struct page *page); - -void put_pages_list(struct list_head *pages); - -void split_page(struct page *page, unsigned int order); - -/* - * Compound pages have a destructor function. Provide a - * prototype for that function and accessor functions. 
- * These are _only_ valid on the head of a compound page. - */ -typedef void compound_page_dtor(struct page *); - -/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ -enum compound_dtor_id { - NULL_COMPOUND_DTOR, - COMPOUND_PAGE_DTOR, -#ifdef CONFIG_HUGETLB_PAGE - HUGETLB_PAGE_DTOR, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - TRANSHUGE_PAGE_DTOR, -#endif - NR_COMPOUND_DTORS, -}; -extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; - -static inline void set_compound_page_dtor(struct page *page, - enum compound_dtor_id compound_dtor) -{ - VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); - page[1].compound_dtor = compound_dtor; -} - -static inline void destroy_compound_page(struct page *page) +static inline struct folio *virt_to_folio(const void *x) { - VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - compound_page_dtors[page[1].compound_dtor](page); -} - -static inline unsigned int compound_order(struct page *page) -{ - if (!PageHead(page)) - return 0; - return page[1].compound_order; -} - -static inline bool hpage_pincount_available(struct page *page) -{ - /* - * Can the page->hpage_pinned_refcount field be used? That field is in - * the 3rd page of the compound page, so the smallest (2-page) compound - * pages cannot support it. - */ - page = compound_head(page); - return PageCompound(page) && compound_order(page) > 1; -} + struct page *page = virt_to_page(x); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); + return page_folio(page); } -static inline int compound_pincount(struct page *page) -{ - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); - page = compound_head(page); - return head_compound_pincount(page); -} +void __folio_put(struct folio *folio); -static inline void set_compound_order(struct page *page, unsigned int order) -{ - page[1].compound_order = order; - page[1].compound_nr = 1U << order; -} +void split_page(struct page *page, unsigned int order); +void folio_copy(struct folio *dst, struct folio *src); +int folio_mc_copy(struct folio *dst, struct folio *src); -/* Returns the number of pages in this potentially compound page. */ -static inline unsigned long compound_nr(struct page *page) -{ - if (!PageHead(page)) - return 1; - return page[1].compound_nr; -} +unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) @@ -956,7 +1200,26 @@ static inline unsigned int page_shift(struct page *page) return PAGE_SHIFT + compound_order(page); } -void free_compound_page(struct page *page); +/** + * thp_order - Order of a transparent huge page. + * @page: Head page of a transparent huge page. + */ +static inline unsigned int thp_order(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_order(page); +} + +/** + * thp_size - Size of a transparent huge page. + * @page: Head page of a transparent huge page. + * + * Return: Number of bytes in this page. 
+ */ +static inline unsigned long thp_size(struct page *page) +{ + return PAGE_SIZE << thp_order(page); +} #ifdef CONFIG_MMU /* @@ -968,13 +1231,15 @@ void free_compound_page(struct page *page); static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); + pte = pte_mkwrite(pte, vma); return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr); + vm_fault_t finish_fault(struct vm_fault *vmf); -vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif /* @@ -1011,9 +1276,9 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * - * A page may belong to an inode's memory mapping. In this case, page->mapping - * is the pointer to the inode, and page->index is the file offset of the page, - * in units of PAGE_SIZE. + * A folio may belong to an inode's memory mapping. In this case, + * folio->mapping points to the inode, and folio->index is the file + * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that @@ -1037,161 +1302,127 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); * back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - */ - -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) -#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) -#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) +/* 127: arbitrary random number, small enough to assemble well */ +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) folio_ref_count(folio) + 127u <= 127u) -/* - * Define the bit shifts to access each section. For non-existent - * sections we define the shift as 0; that plus a 0 mask ensures - * the compiler will optimise away reference to them. +/** + * folio_get - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * folio_try_get() may be the right interface for you to use. */ -#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) -#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) - -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ -#ifdef NODE_NOT_IN_PAGE_FLAGS -#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ - SECTIONS_PGOFF : ZONES_PGOFF) -#else -#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? 
\ - NODES_PGOFF : ZONES_PGOFF) -#endif - -#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) - -#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) -#define NODES_MASK ((1UL << NODES_WIDTH) - 1) -#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) -#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) -#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) - -static inline enum zone_type page_zonenum(const struct page *page) +static inline void folio_get(struct folio *folio) { - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + folio_ref_inc(folio); } -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_DEVICE; -} -extern void memmap_init_zone_device(struct zone *, unsigned long, - unsigned long, struct dev_pagemap *); -#else -static inline bool is_zone_device_page(const struct page *page) +static inline void get_page(struct page *page) { - return false; + struct folio *folio = page_folio(page); + if (WARN_ON_ONCE(folio_test_slab(folio))) + return; + folio_get(folio); } -#endif - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page); -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool page_is_devmap_managed(struct page *page) +static inline __must_check bool try_get_page(struct page *page) { - if (!static_branch_unlikely(&devmap_managed_key)) - return false; - if (!is_zone_device_page(page)) + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; - switch (page->pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_FS_DAX: - return true; - default: - break; - } - return false; -} - -void put_devmap_managed_page(struct page *page); - -#else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool page_is_devmap_managed(struct page *page) -{ - return false; -} - -static inline void put_devmap_managed_page(struct page *page) -{ + page_ref_inc(page); + return true; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool is_device_private_page(const struct page *page) +/** + * folio_put - Decrement the reference count on a folio. + * @folio: The folio. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put(struct folio *folio) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; + if (folio_put_testzero(folio)) + __folio_put(folio); } -static inline bool is_pci_p2pdma_page(const struct page *page) +/** + * folio_put_refs - Reduce the reference count on a folio. + * @folio: The folio. + * @refs: The amount to subtract from the folio's reference count. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. 
Do not access the memory or the struct folio + * after calling folio_put_refs() unless you can be sure that these weren't + * the last references. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put_refs(struct folio *folio, int refs) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + if (folio_ref_sub_and_test(folio, refs)) + __folio_put(folio); } -/* 127: arbitrary random number, small enough to assemble well */ -#define page_ref_zero_or_close_to_overflow(page) \ - ((unsigned int) page_ref_count(page) + 127u <= 127u) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); -static inline void get_page(struct page *page) -{ - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_refcount. - */ - VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); - page_ref_inc(page); -} +/* + * union release_pages_arg - an array of pages or folios + * + * release_pages() releases a simple array of multiple pages, and + * accepts various different forms of said page array: either + * a regular old boring array of pages, an array of folios, or + * an array of encoded page pointers. + * + * The transparent union syntax for this kind of "any of these + * argument types" is all kinds of ugly, so look away. + */ +typedef union { + struct page **pages; + struct folio **folios; + struct encoded_page **encoded_pages; +} release_pages_arg __attribute__ ((__transparent_union__)); -bool __must_check try_grab_page(struct page *page, unsigned int flags); +void release_pages(release_pages_arg, int nr); -static inline __must_check bool try_get_page(struct page *page) +/** + * folios_put - Decrement the reference count on an array of folios. + * @folios: The folios. + * + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folios_put(struct folio_batch *folios) { - page = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - page_ref_inc(page); - return true; + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { - page = compound_head(page); + struct folio *folio = page_folio(page); - /* - * For devmap managed pages we need to catch refcount transition from - * 2 to 1, when refcount reach one it means the page is free and we - * need to inform the device driver through callback. See - * include/linux/memremap.h and HMM for details. - */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (folio_test_slab(folio)) return; - } - if (put_page_testzero(page)) - __put_page(page); + folio_put(folio); } /* @@ -1220,60 +1451,41 @@ static inline void put_page(struct page *page) * applications that don't have huge page reference counts, this won't be an * issue. 
* - * Locking: the lockless algorithm described in page_cache_get_speculative() - * and page_cache_gup_pin_speculative() provides safe operation for - * get_user_pages and page_mkclean and other calls that race to set up page - * table entries. + * Locking: the lockless algorithm described in folio_try_get_rcu() + * provides safe operation for get_user_pages(), folio_mkclean() and + * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); +void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_user_folio(struct folio *folio, unsigned long npages); +void unpin_folios(struct folio **folios, unsigned long nfolios); -/** - * page_maybe_dma_pinned() - report if a page is pinned for DMA. - * - * This function checks if a page has been pinned via a call to - * pin_user_pages*(). - * - * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, - * because it means "definitely not pinned for DMA", but true means "probably - * pinned for DMA, but possibly a false positive due to having at least - * GUP_PIN_COUNTING_BIAS worth of normal page references". - * - * False positives are OK, because: a) it's unlikely for a page to get that many - * refcounts, and b) all the callers of this routine are expected to be able to - * deal gracefully with a false positive. - * - * For huge pages, the result will be exactly correct. That's because we have - * more tracking data available: the 3rd struct page in the compound page is - * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS - * scheme). - * - * For more information, please see Documentation/core-api/pin_user_pages.rst. - * - * @page: pointer to page to be queried. - * @Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. - */ -static inline bool page_maybe_dma_pinned(struct page *page) +static inline bool is_cow_mapping(vm_flags_t flags) { - if (hpage_pincount_available(page)) - return compound_pincount(page) > 0; + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) +{ /* - * page_ref_count() is signed. If that refcount overflows, then - * page_ref_count() returns a negative value, and callers will avoid - * further incrementing the refcount. - * - * Here, for that overflow case, use the signed bit to count a little - * bit higher via unsigned math, and thus still get an accurate result. + * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected + * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of + * a file mapping. R/O MAP_PRIVATE mappings might still modify + * underlying memory if ptrace is active, so this is only possible if + * ptrace does not apply. Note that there is no mprotect() to upgrade + * write permissions later. 
*/ - return ((unsigned int)page_ref_count(compound_head(page))) >= - GUP_PIN_COUNTING_BIAS; + return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } +#endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS @@ -1293,17 +1505,32 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(const struct page *page); +int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - struct page *p = (struct page *)page; - - return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif +static inline int folio_nid(const struct folio *folio) +{ + return page_to_nid(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); @@ -1341,41 +1568,67 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS -static inline int page_cpupid_xchg_last(struct page *page, int cpupid) +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { - return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); + return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio) { - return page->_last_cpupid; + return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio) { - return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } -extern int page_cpupid_xchg_last(struct page *page, int cpupid); +int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ + +static inline int folio_xchg_access_time(struct folio *folio, int time) +{ + int last_time; + + last_time = folio_xchg_last_cpupid(folio, + time >> PAGE_ACCESS_TIME_BUCKETS); + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ + unsigned int pid_bit; + + pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { + __set_bit(pid_bit, &vma->numab_state->pids_active[1]); + } +} + +bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ -static inline int page_cpupid_xchg_last(struct page *page, int cpupid) +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { - return page_to_nid(page); /* XXX */ + return folio_nid(folio); /* XXX */ } -static inline int page_cpupid_last(struct page 
*page) +static inline int folio_xchg_access_time(struct folio *folio, int time) { - return page_to_nid(page); /* XXX */ + return 0; +} + +static inline int folio_last_cpupid(struct folio *folio) +{ + return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) @@ -1411,25 +1664,60 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ +} +static inline bool folio_use_access_time(struct folio *folio) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +/* + * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid + * setting tags for all pages to native kernel tag value 0xff, as the default + * value 0x00 maps to 0xff. + */ + static inline u8 page_kasan_tag(const struct page *page) { - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + u8 tag = KASAN_TAG_KERNEL; + + if (kasan_enabled()) { + tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag ^= 0xff; + } + + return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + unsigned long old_flags, flags; + + if (!kasan_enabled()) + return; + + tag ^= 0xff; + old_flags = READ_ONCE(page->flags); + do { + flags = old_flags; + flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { - page_kasan_tag_set(page, 0xff); + if (kasan_enabled()) + page_kasan_tag_set(page, KASAN_TAG_KERNEL); } -#else + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + static inline u8 page_kasan_tag(const struct page *page) { return 0xff; @@ -1437,7 +1725,8 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } -#endif + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { @@ -1449,6 +1738,16 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } +static inline struct zone *folio_zone(const struct folio *folio) +{ + return page_zone(&folio->page); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return page_pgdat(&folio->page); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { @@ -1462,6 +1761,188 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/** + * folio_pfn - Return the Page Frame Number of a folio. + * @folio: The folio. + * + * A folio may contain multiple pages. The pages have consecutive + * Page Frame Numbers. + * + * Return: The Page Frame Number of the first page in the folio. 
+ */ +static inline unsigned long folio_pfn(const struct folio *folio) +{ + return page_to_pfn(&folio->page); +} + +static inline struct folio *pfn_folio(unsigned long pfn) +{ + return page_folio(pfn_to_page(pfn)); +} + +#ifdef CONFIG_MMU +static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +{ + return pfn_pte(page_to_pfn(page), pgprot); +} + +/** + * folio_mk_pte - Create a PTE for this folio + * @folio: The folio to create a PTE for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_ptes(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +{ + return pfn_pte(folio_pfn(folio), pgprot); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * folio_mk_pmd - Create a PMD for this folio + * @folio: The folio to create a PMD for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_pmd_at(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +{ + return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); +} +#endif +#endif /* CONFIG_MMU */ + +static inline bool folio_has_pincount(const struct folio *folio) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return folio_test_large(folio); + return folio_order(folio) > 1; +} + +/** + * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. + * @folio: The folio. + * + * This function checks if a folio has been pinned via a call to + * a function in the pin_user_pages() family. + * + * For small folios, the return value is partially fuzzy: false is not fuzzy, + * because it means "definitely not pinned for DMA", but true means "probably + * pinned for DMA, but possibly a false positive due to having at least + * GUP_PIN_COUNTING_BIAS worth of normal folio references". + * + * False positives are OK, because: a) it's unlikely for a folio to + * get that many refcounts, and b) all the callers of this routine are + * expected to be able to deal gracefully with a false positive. + * + * For most large folios, the result will be exactly correct. That's because + * we have more tracking data available: the _pincount field is used + * instead of the GUP_PIN_COUNTING_BIAS scheme. + * + * For more information, please see Documentation/core-api/pin_user_pages.rst. + * + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. + */ +static inline bool folio_maybe_dma_pinned(struct folio *folio) +{ + if (folio_has_pincount(folio)) + return atomic_read(&folio->_pincount) > 0; + + /* + * folio_ref_count() is signed. If that refcount overflows, then + * folio_ref_count() returns a negative value, and callers will avoid + * further incrementing the refcount. + * + * Here, for that overflow case, use the sign bit to count a little + * bit higher via unsigned math, and thus still get an accurate result. + */ + return ((unsigned int)folio_ref_count(folio)) >= + GUP_PIN_COUNTING_BIAS; +} + +/* + * This should most likely only be called during fork() to see whether we + * should break the cow immediately for an anon page on the src mm. + * + * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. 
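For small folios, folio_maybe_dma_pinned() above relies on pin_user_pages() adding GUP_PIN_COUNTING_BIAS to the refcount for each pin. A stand-alone sketch of that arithmetic, assuming the usual bias value of 1024 (the constant is defined elsewhere in this header), including the documented false-positive case:

#include <stdbool.h>
#include <stdio.h>

#define GUP_PIN_COUNTING_BIAS 1024u	/* assumed value of the real constant */

static bool maybe_dma_pinned(unsigned int refcount)
{
	return refcount >= GUP_PIN_COUNTING_BIAS;
}

int main(void)
{
	unsigned int ref = 1;			/* the allocation's own reference */

	ref += GUP_PIN_COUNTING_BIAS;		/* one pin_user_pages() pin */
	printf("pinned once     -> %d\n", maybe_dma_pinned(ref));	/* 1 */

	ref -= GUP_PIN_COUNTING_BIAS;		/* unpin_user_page() */
	printf("after unpin     -> %d\n", maybe_dma_pinned(ref));	/* 0 */

	/* Documented false positive: many ordinary references look pinned. */
	printf("1024 plain refs -> %d\n", maybe_dma_pinned(1024));	/* 1 */
	return 0;
}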
+ */ +static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, + struct folio *folio) +{ + VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); + + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + return false; + + return folio_maybe_dma_pinned(folio); +} + +/** + * is_zero_page - Query if a page is a zero page + * @page: The page to query + * + * This returns true if @page is one of the permanent zero pages. + */ +static inline bool is_zero_page(const struct page *page) +{ + return is_zero_pfn(page_to_pfn(page)); +} + +/** + * is_zero_folio - Query if a folio is a zero page + * @folio: The folio to query + * + * This returns true if @folio is one of the permanent zero pages. + */ +static inline bool is_zero_folio(const struct folio *folio) +{ + return is_zero_page(&folio->page); +} + +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ +#ifdef CONFIG_MIGRATION +static inline bool folio_is_longterm_pinnable(struct folio *folio) +{ +#ifdef CONFIG_CMA + int mt = folio_migratetype(folio); + + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) + return false; +#endif + /* The zero page can be "pinned" but gets special handling. */ + if (is_zero_folio(folio)) + return true; + + /* Coherent device memory must always allow eviction. */ + if (folio_is_device_coherent(folio)) + return false; + + /* + * Filesystems can only tolerate transient delays to truncate and + * hole-punch operations + */ + if (folio_is_fsdax(folio)) + return false; + + /* Otherwise, non-movable zone folios can be pinned. */ + return !folio_is_zone_movable(folio); + +} +#else +static inline bool folio_is_longterm_pinnable(struct folio *folio) +{ + return true; +} +#endif + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -1484,25 +1965,212 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A positive power of two. + */ +static inline long folio_nr_pages(const struct folio *folio) { - return page->mem_cgroup; + if (!folio_test_large(folio)) + return 1; + return folio_large_nr_pages(folio); } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + +/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#else +#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#endif + +/* + * compound_nr() returns the number of pages in this potentially compound + * page. compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. + */ +static inline long compound_nr(struct page *page) { - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) + return 1; + return folio_large_nr_pages(folio); } -#else -static inline struct mem_cgroup *page_memcg(struct page *page) + +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. + * + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. 
This is the folio equivalent to writing ``page++``. + * + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. + */ +static inline struct folio *folio_next(struct folio *folio) { - return NULL; + return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + +/** + * folio_shift - The size of the memory described by this folio. + * @folio: The folio. + * + * A folio represents a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. See also + * folio_size() and folio_order(). + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(const struct folio *folio) { - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; + return PAGE_SHIFT + folio_order(folio); +} + +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. + */ +static inline size_t folio_size(const struct folio *folio) +{ + return PAGE_SIZE << folio_order(folio); +} + +/** + * folio_maybe_mapped_shared - Whether the folio is mapped into the page + * tables of more than one MM + * @folio: The folio. + * + * This function checks if the folio maybe currently mapped into more than one + * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single + * MM ("mapped exclusively"). + * + * For KSM folios, this function also returns "mapped shared" when a folio is + * mapped multiple times into the same MM, because the individual page mappings + * are independent. + * + * For small anonymous folios and anonymous hugetlb folios, the return + * value will be exactly correct: non-KSM folios can only be mapped at most once + * into an MM, and they cannot be partially mapped. KSM folios are + * considered shared even if mapped multiple times into the same MM. + * + * For other folios, the result can be fuzzy: + * #. For partially-mappable large folios (THP), the return value can wrongly + * indicate "mapped shared" (false positive) if a folio was mapped by + * more than two MMs at one point in time. + * #. For pagecache folios (including hugetlb), the return value can wrongly + * indicate "mapped shared" (false positive) when two VMAs in the same MM + * cover the same file range. + * + * Further, this function only considers current page table mappings that + * are tracked using the folio mapcount(s). + * + * This function does not consider: + * #. If the folio might get mapped in the (near) future (e.g., swapcache, + * pagecache, temporary unmapping for migration). + * #. If the folio is mapped differently (VM_PFNMAP). + * #. If hugetlb page table sharing applies. Callers might want to check + * hugetlb_pmd_shared(). + * + * Return: Whether the folio is estimated to be mapped into more than one MM. + */ +static inline bool folio_maybe_mapped_shared(struct folio *folio) +{ + int mapcount = folio_mapcount(folio); + + /* Only partially-mappable folios require more care. 
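folio_next() and folio_size() above are the building blocks for walking a physically contiguous buffer folio by folio. A rough kernel-style sketch, not part of this patch, assuming the buffer starts at the first byte of the first folio:

/* Count how many folios back a contiguous buffer of 'len' bytes. */
static unsigned long count_folios(struct folio *folio, size_t len)
{
	unsigned long n = 0;

	while (len) {
		size_t step = min(len, folio_size(folio));

		n++;
		len -= step;
		folio = folio_next(folio);	/* physically adjacent folio */
	}
	return n;
}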
*/ + if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) + return mapcount > 1; + + /* + * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... + * simply assume "mapped shared", nobody should really care + * about this for arbitrary kernel allocations. + */ + if (!IS_ENABLED(CONFIG_MM_ID)) + return true; + + /* + * A single mapping implies "mapped exclusively", even if the + * folio flag says something different: it's easier to handle this + * case here instead of on the RMAP hot path. + */ + if (mapcount <= 1) + return false; + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} + +/** + * folio_expected_ref_count - calculate the expected folio refcount + * @folio: the folio + * + * Calculate the expected folio refcount, taking references from the pagecache, + * swapcache, PG_private and page table mappings into account. Useful in + * combination with folio_ref_count() to detect unexpected references (e.g., + * GUP or other temporary references). + * + * Does currently not consider references from the LRU cache. If the folio + * was isolated from the LRU (which is the case during migration or split), + * the LRU cache does not apply. + * + * Calling this function on an unmapped folio -- !folio_mapped() -- that is + * locked will return a stable result. + * + * Calling this function on a mapped folio will not result in a stable result, + * because nothing stops additional page table mappings from coming (e.g., + * fork()) or going (e.g., munmap()). + * + * Calling this function without the folio lock will also not result in a + * stable result: for example, the folio might get dropped from the swapcache + * concurrently. + * + * However, even when called without the folio lock or on a mapped folio, + * this function can be used to detect unexpected references early (for example, + * if it makes sense to even lock the folio and unmap it). + * + * The caller must add any reference (e.g., from folio_try_get()) it might be + * holding itself to the result. + * + * Returns the expected folio refcount. + */ +static inline int folio_expected_ref_count(const struct folio *folio) +{ + const int order = folio_order(folio); + int ref_count = 0; + + if (WARN_ON_ONCE(folio_test_slab(folio))) + return 0; + + if (folio_test_anon(folio)) { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) { + /* One reference per page from the pagecache. */ + ref_count += !!folio->mapping << order; + /* One reference from PG_private. */ + ref_count += folio_test_private(folio); + } + + /* One reference per page table mapping. 
*/ + return ref_count + folio_mapcount(folio); +} + +#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE +static inline int arch_make_folio_accessible(struct folio *folio) +{ + return 0; } #endif @@ -1511,11 +2179,6 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) */ #include <linux/vmstat.h> -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -1538,56 +2201,50 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif -extern void *page_rmapping(struct page *page); -extern struct anon_vma *page_anon_vma(struct page *page); -extern struct address_space *page_mapping(struct page *page); - -extern struct address_space *__page_file_mapping(struct page *); - -static inline -struct address_space *page_file_mapping(struct page *page) +static inline void *folio_address(const struct folio *folio) { - if (unlikely(PageSwapCache(page))) - return __page_file_mapping(page); - - return page->mapping; + return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - /* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) + * Return true only if the page has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. */ -static inline pgoff_t page_index(struct page *page) +static inline bool page_is_pfmemalloc(const struct page *page) { - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; + /* + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. + */ + return (uintptr_t)page->lru.next & BIT(1); } -bool page_mapped(struct page *page); -struct address_space *page_mapping(struct page *page); -struct address_space *page_mapping_file(struct page *page); - /* - * Return true only if the page has been allocated with + * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ -static inline bool page_is_pfmemalloc(struct page *page) +static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* - * Page index cannot be this large so this must be - * a pfmemalloc page. + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. 
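A typical use of folio_expected_ref_count() is the comparison its kernel-doc above describes: the caller adds the references it holds itself and checks whether anything is left over. A hypothetical helper, assuming the caller owns exactly one reference (e.g. from folio_try_get()):

/* True if someone else (GUP, a speculative lookup, ...) also holds a
 * reference, beyond the single reference owned by the caller. */
static bool folio_has_extra_refs(struct folio *folio)
{
	return folio_ref_count(folio) != folio_expected_ref_count(folio) + 1;
}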
*/ - return page->index == -1UL; + return (uintptr_t)folio->lru.next & BIT(1); } /* @@ -1596,12 +2253,12 @@ static inline bool page_is_pfmemalloc(struct page *page) */ static inline void set_page_pfmemalloc(struct page *page) { - page->index = -1UL; + page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { - page->index = 0; + page->lru.next = NULL; } /* @@ -1610,44 +2267,82 @@ static inline void clear_page_pfmemalloc(struct page *page) extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) -#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) + +/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct folio *single_folio; /* Locked folio to be unmapped */ + bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ + zap_flags_t zap_flags; /* Extra flags for zapping */ +}; /* - * Flags passed to show_mem() and show_free_areas() to suppress output in - * various contexts. + * Whether to drop the pte markers, for example, the uffd-wp information for + * file-backed memory. This should only be specified when we will completely + * drop the page in the mm, either by truncation or unmapping of the vma. By + * default, the flag is not set. */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) +/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ +#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return raw_smp_processor_id(); +} +#endif #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif -extern int user_shm_lock(size_t, struct user_struct *); -extern void user_shm_unlock(size_t, struct user_struct *); - -/* - * Parameter block passed down to zap_pte_range in exceptional cases. 
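The pfmemalloc accessors above encode the flag in bit 1 of the lru.next slot, so it survives only until someone reuses that field. A small user-space model of the pointer-bit encoding:

#include <assert.h>
#include <stdint.h>

#define BIT(nr) (1UL << (nr))

struct fake_page {
	void *lru_next;			/* models page->lru.next */
};

static void set_pfmemalloc(struct fake_page *p)
{
	p->lru_next = (void *)BIT(1);
}

static void clear_pfmemalloc(struct fake_page *p)
{
	p->lru_next = NULL;
}

static int is_pfmemalloc(const struct fake_page *p)
{
	return ((uintptr_t)p->lru_next & BIT(1)) != 0;
}

int main(void)
{
	struct fake_page page = { .lru_next = NULL };

	assert(!is_pfmemalloc(&page));
	set_pfmemalloc(&page);
	assert(is_pfmemalloc(&page));
	clear_pfmemalloc(&page);
	assert(!is_pfmemalloc(&page));
	return 0;
}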
- */ -struct zap_details { - struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ -}; +extern int user_shm_lock(size_t, struct ucounts *); +extern void user_shm_unlock(size_t, struct ucounts *); +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details); +static inline void zap_vma_pages(struct vm_area_struct *vma) +{ + zap_page_range_single(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); +} +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; @@ -1655,23 +2350,51 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); -int follow_phys(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. 
+ */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + unsigned long addr_mask; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int truncate_inode_page(struct address_space *mapping, struct page *page); -int generic_error_remove_page(struct address_space *mapping, struct page *page); -int invalidate_inode_page(struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); + +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, @@ -1712,131 +2435,103 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } +static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, + unsigned long addr); + extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags); + +#ifdef CONFIG_BPF_SYSCALL +extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +#endif long get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + int *locked); + +/* + * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. 
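The follow_pfnmap_args structure documents its own calling convention: fill in the inputs, call follow_pfnmap_start(), consume the outputs, then call follow_pfnmap_end(). A hedged sketch of that sequence; the surrounding driver context is hypothetical, and the caller is assumed to hold the mmap lock so the VMA stays stable:

static int lookup_pfn(struct vm_area_struct *vma, unsigned long addr,
		      unsigned long *pfn)
{
	struct follow_pfnmap_args args = {
		.vma		= vma,
		.address	= addr,
	};
	int ret;

	ret = follow_pfnmap_start(&args);
	if (ret)
		return ret;

	*pfn = args.pfn;		/* outputs are only valid until _end() */

	follow_pfnmap_end(&args);
	return 0;
}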
+ */ +static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, + unsigned long addr, + int gup_flags, + struct vm_area_struct **vmap) +{ + struct page *page; + struct vm_area_struct *vma; + int got; + + if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) + return ERR_PTR(-EINVAL); + + got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); + + if (got < 0) + return ERR_PTR(got); + + vma = vma_lookup(mm, addr); + if (WARN_ON_ONCE(!vma)) { + put_page(page); + return ERR_PTR(-EINVAL); + } + + *vmap = vma; + return page; +} + long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); + unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); +int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); -/* Container for pinned pfns / pages */ -struct frame_vector { - unsigned int nr_allocated; /* Number of frames we have space for */ - unsigned int nr_frames; /* Number of frames stored in ptrs array */ - bool got_ref; /* Did we pin pages by getting page ref? */ - bool is_pfns; /* Does array contain pages or pfns? */ - void *ptrs[]; /* Array of pinned pfns / pages. 
Use - * pfns_vector_pages() or pfns_vector_pfns() - * for access */ -}; - -struct frame_vector *frame_vector_create(unsigned int nr_frames); -void frame_vector_destroy(struct frame_vector *vec); -int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, - unsigned int gup_flags, struct frame_vector *vec); -void put_vaddr_frames(struct frame_vector *vec); -int frame_vector_to_pages(struct frame_vector *vec); -void frame_vector_to_pfns(struct frame_vector *vec); - -static inline unsigned int frame_vector_count(struct frame_vector *vec) -{ - return vec->nr_frames; -} - -static inline struct page **frame_vector_pages(struct frame_vector *vec) -{ - if (vec->is_pfns) { - int err = frame_vector_to_pages(vec); - - if (err) - return ERR_PTR(err); - } - return (struct page **)(vec->ptrs); -} - -static inline unsigned long *frame_vector_pfns(struct frame_vector *vec) -{ - if (!vec->is_pfns) - frame_vector_to_pfns(vec); - return (unsigned long *)(vec->ptrs); -} - struct kvec; -int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, - struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); -struct page *get_dump_page(unsigned long addr); - -extern int try_to_release_page(struct page * page, gfp_t gfp_mask); -extern void do_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); - -void __set_page_dirty(struct page *, struct address_space *, int warn); -int __set_page_dirty_nobuffers(struct page *page); -int __set_page_dirty_no_writeback(struct page *page); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_cleaned(struct page *page, struct address_space *mapping, - struct bdi_writeback *wb); -int set_page_dirty(struct page *page); +struct page *get_dump_page(unsigned long addr, int *locked); + +bool folio_mark_dirty(struct folio *folio); +bool folio_mark_dirty_lock(struct folio *folio); +bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void __cancel_dirty_page(struct page *page); -static inline void cancel_dirty_page(struct page *page) -{ - /* Avoid atomic ops, locking, etc. when not actually needed. */ - if (PageDirty(page)) - __cancel_dirty_page(page); -} -int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -extern unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks); - /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However * for now all the callers are only use one of the flags at the same * time. */ -/* Whether we should allow dirty bit accounting */ -#define MM_CP_DIRTY_ACCT (1UL << 0) +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. 
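The pin_user_pages*() calls declared above pair with the unpin_user_page*() family for release; unpin_user_pages_dirty_lock() additionally marks the pages dirty on the way out. A kernel-style sketch of the usual pin / do I/O / unpin cycle (the calling context is hypothetical and the I/O step is elided):

/* Pin a user buffer, let a device write into it, then dirty and release the
 * pages. 'pages' must have room for 'nr' entries. */
static int with_pinned_pages(unsigned long uaddr, int nr, struct page **pages)
{
	int pinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);

	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... program the device and wait for the transfer to finish ... */

	/* Mark the pages dirty and drop the FOLL_PIN references in one go. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}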
+ */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ @@ -1845,20 +2540,20 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags); -extern int mprotect_fixup(struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); +extern long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long cp_flags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -int pin_user_pages_fast_only(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) @@ -1870,55 +2565,45 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); - -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. 
- */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; + return percpu_counter_read_positive(&mm->rss_stat[member]); } -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) @@ -1942,7 +2627,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); - if ((mm)->hiwater_rss < _rss) + if (data_race(mm->hiwater_rss) < _rss) (mm)->hiwater_rss = _rss; } @@ -1966,14 +2651,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, *maxrss = hiwater_rss; } -#if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct mm_struct *mm); -#else -static inline void sync_mm_rss(struct mm_struct *mm) -{ -} -#endif - #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { @@ -1986,6 +2663,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { @@ -1993,8 +2694,6 @@ static inline int pte_devmap(pte_t pte) } #endif -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); - extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -2128,42 +2827,101 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ -#if USE_SPLIT_PTE_PTLOCKS +static inline struct ptdesc *virt_to_ptdesc(const void *x) +{ + return page_ptdesc(virt_to_page(x)); +} + +static inline void *ptdesc_to_virt(const struct ptdesc *pt) +{ + return 
page_to_virt(ptdesc_page(pt)); +} + +static inline void *ptdesc_address(const struct ptdesc *pt) +{ + return folio_address(ptdesc_folio(pt)); +} + +static inline bool pagetable_is_reserved(struct ptdesc *pt) +{ + return folio_test_reserved(ptdesc_folio(pt)); +} + +/** + * pagetable_alloc - Allocate pagetables + * @gfp: GFP flags + * @order: desired pagetable order + * + * pagetable_alloc allocates memory for page tables as well as a page table + * descriptor to describe that memory. + * + * Return: The ptdesc describing the allocated page tables. + */ +static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); + + return page_ptdesc(page); +} +#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) + +/** + * pagetable_free - Free pagetables + * @pt: The page table descriptor + * + * pagetable_free frees the memory of all page tables described by a page + * table descriptor and the memory for the descriptor itself. + */ +static inline void pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); -extern bool ptlock_alloc(struct page *page); -extern void ptlock_free(struct page *page); +bool ptlock_alloc(struct ptdesc *ptdesc); +void ptlock_free(struct ptdesc *ptdesc); -static inline spinlock_t *ptlock_ptr(struct page *page) +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { - return page->ptl; + return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } -static inline bool ptlock_alloc(struct page *page) +static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } -static inline void ptlock_free(struct page *page) +static inline void ptlock_free(struct ptdesc *ptdesc) { } -static inline spinlock_t *ptlock_ptr(struct page *page) +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { - return &page->ptl; + return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_page(*pmd)); + return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); +} + +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); + BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); + return ptlock_ptr(virt_to_ptdesc(pte)); } -static inline bool ptlock_init(struct page *page) +static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) @@ -2172,14 +2930,14 @@ static inline bool ptlock_init(struct page *page) * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. */ - VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); - if (!ptlock_alloc(page)) + VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); + if (!ptlock_alloc(ptdesc)) return false; - spin_lock_init(ptlock_ptr(page)); + spin_lock_init(ptlock_ptr(ptdesc)); return true; } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. 
*/ @@ -2187,41 +2945,78 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} static inline void ptlock_cache_init(void) {} -static inline bool ptlock_init(struct page *page) { return true; } -static inline void ptlock_free(struct page *page) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } +static inline void ptlock_free(struct ptdesc *ptdesc) {} +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline void pgtable_init(void) +static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - ptlock_cache_init(); - pgtable_cache_init(); + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline bool pgtable_pte_page_ctor(struct page *page) +static inline void pagetable_dtor(struct ptdesc *ptdesc) { - if (!ptlock_init(page)) + struct folio *folio = ptdesc_folio(ptdesc); + + ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + +static inline bool pagetable_pte_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) +{ + if (mm != &init_mm && !ptlock_init(ptdesc)) return false; - __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } -static inline void pgtable_pte_page_dtor(struct page *page) +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} +static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) +{ + return __pte_offset_map(pmd, addr, NULL); +} + +pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) { - ptlock_free(page); - __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + pte_t *pte; + + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); + return pte; } -#define pte_offset_map_lock(mm, pmd, address, ptlp) \ -({ \ - spinlock_t *__ptl = pte_lockptr(mm, pmd); \ - pte_t *__pte = pte_offset_map(pmd, address); \ - *(ptlp) = __ptl; \ - spin_lock(__ptl); \ - __pte; \ -}) +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ @@ -2241,36 +3036,33 @@ static inline void pgtable_pte_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? 
\ NULL: pte_offset_kernel(pmd, address)) -#if USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_SPLIT_PMD_PTLOCKS) -static struct page *pmd_to_page(pmd_t *pmd) +static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } -static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { - return ptlock_ptr(pmd_to_page(pmd)); + return page_ptdesc(pmd_pgtable_page(pmd)); } -static inline bool pmd_ptlock_init(struct page *page) +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page->pmd_huge_pte = NULL; -#endif - return ptlock_init(page); + return ptlock_ptr(pmd_ptdesc(pmd)); } -static inline void pmd_ptlock_free(struct page *page) +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(page->pmd_huge_pte, page); + ptdesc->pmd_huge_pte = NULL; #endif - ptlock_free(page); + return ptlock_init(ptdesc); } -#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -2279,8 +3071,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } -static inline bool pmd_ptlock_init(struct page *page) { return true; } -static inline void pmd_ptlock_free(struct page *page) {} +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -2293,22 +3084,16 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool pagetable_pmd_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(page)) + if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; - __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + ptdesc_pmd_pts_init(ptdesc); + __pagetable_ctor(ptdesc); return true; } -static inline void pgtable_pmd_page_dtor(struct page *page) -{ - pmd_ptlock_free(page); - __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be @@ -2328,8 +3113,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) return ptl; } +static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + extern void __init pagecache_init(void); -extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); /* @@ -2341,32 +3140,13 @@ extern void free_initmem(void); extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); -#ifdef CONFIG_HIGHMEM -/* - * Free a highmem page into the buddy system, adjusting totalhigh_pages - * and totalram_pages. 
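With the reworked pte_offset_map_lock() above, the returned PTE pointer can be NULL when the page table has been freed or replaced concurrently, so callers are expected to retry or bail out rather than assume success. A sketch of the standard map-lock-unmap pattern under that assumption (the surrounding function is hypothetical):

/* Examine one PTE with its page table lock held. */
static int touch_one_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;		/* table went away: caller retries */

	/* ... read or update *pte while holding ptl ... */

	pte_unmap_unlock(pte, ptl);
	return 0;
}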
- */ -extern void free_highmem_page(struct page *page); -#endif - extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(const char *str); -extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); +extern void reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ -static inline void __free_reserved_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); -} - -static inline void free_reserved_page(struct page *page) -{ - __free_reserved_page(page); - adjust_managed_page_count(page, 1); -} +void free_reserved_page(struct page *page); static inline void mark_page_reserved(struct page *page) { @@ -2374,6 +3154,11 @@ static inline void mark_page_reserved(struct page *page) adjust_managed_page_count(page, -1); } +static inline void free_reserved_ptdesc(struct ptdesc *pt) +{ + free_reserved_page(ptdesc_page(pt)); +} + /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within @@ -2385,7 +3170,7 @@ static inline unsigned long free_initmem_default(int poison) extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, - poison, "unused kernel"); + poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) @@ -2412,20 +3197,17 @@ static inline unsigned long get_num_physpages(void) * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) + * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); -unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, - unsigned long end_pfn); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; @@ -2433,37 +3215,25 @@ static inline int early_pfn_to_nid(unsigned long pfn) #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -/* there is a per-arch backend function. 
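The comment above outlines the boot-time sequence an architecture follows before calling free_area_init(). An illustrative, hedged version of that sequence as a minimal paging_init(); the zone cut-offs and the use of max_pfn are examples only, not a recommendation for any particular architecture:

/* Split memory at the 4 GiB mark into DMA32 and NORMAL (illustrative). */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = min(max_pfn, 1UL << (32 - PAGE_SHIFT));
#endif
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init(max_zone_pfns);
}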
*/ -extern int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state); #endif -extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *, int migratetype); -extern void setup_per_zone_wmarks(void); -extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags, nodemask_t *nodemask); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(void) +{ + __show_mem(0, NULL, MAX_NR_ZONES - 1); +} extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES -extern unsigned long arch_reserved_kernel_pages(void); -#endif extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -/* page_alloc.c */ -extern int min_free_kbytes; -extern int watermark_boost_factor; -extern int watermark_scale_factor; -extern bool arch_has_descending_max_zone_pfns(void); - /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); @@ -2504,31 +3274,10 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) -{ - return __vma_adjust(vma, start, end, pgoff, insert, NULL); -} -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); -extern void unlink_file_vma(struct vm_area_struct *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -2547,7 +3296,8 @@ static inline int check_data_rlimit(unsigned long rlim, extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file 
*new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); @@ -2560,23 +3310,31 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); -/* This is an obsolete alternative to _install_special_mapping. */ -extern int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); +unsigned long randomize_page(unsigned long start, unsigned long range); -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +unsigned long +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); + +static inline unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return __get_unmapped_area(file, addr, len, pgoff, flags, 0); +} -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf); +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool unlock); +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); @@ -2593,8 +3351,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif -/* These take the mm semaphore themselves */ -extern int __must_check vm_brk(unsigned long, unsigned long); +/* This takes the mm semaphore itself */ extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, @@ -2609,6 +3366,7 @@ struct vm_unmapped_area_info { unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; + unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); @@ -2621,52 +3379,60 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); -extern void filemap_map_pages(struct vm_fault *vmf, +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -/* mm/page-writeback.c */ -int __must_check write_one_page(struct page *page); -void task_dirty_inc(struct 
task_struct *tsk); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -extern int expand_stack(struct vm_area_struct *vma, unsigned long address); - -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -extern int expand_downwards(struct vm_area_struct *vma, - unsigned long address); -#if VM_GROWSUP -extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); -#else - #define expand_upwards(vma, address) (0) -#endif +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); +struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); + +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. + */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); +} + +static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + if (vma->vm_flags & VM_GROWSDOWN) + return stack_guard_gap; - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; + /* See reasoning around the VM_SHADOW_STACK definition */ + if (vma->vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { + unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; - if (vma->vm_flags & VM_GROWSDOWN) { - vm_start -= stack_guard_gap; - if (vm_start > vma->vm_start) - vm_start = 0; - } + vm_start -= gap; + if (vm_start > vma->vm_start) + vm_start = 0; return vm_start; } @@ -2691,7 +3457,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; @@ -2719,14 +3485,19 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) } #endif +void vma_set_file(struct vm_area_struct *vma, struct file *file); + #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif -struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, + unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, 
unsigned long size, pgprot_t); +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); @@ -2734,14 +3505,14 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); -vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); @@ -2759,93 +3530,41 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; + else if (err == -EHWPOISON) + return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags); - -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in page */ -#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_COW 0x4000 /* internal GUP flag */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ - -/* - * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each - * other. Here is what they mean, and how to use them: - * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. 
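The generic io_remap_pfn_range() fallback and the -EHWPOISON case in vmf_error() are added above. As a hedged sketch of the remap helper in a driver ->mmap() handler: example_phys_base is a hypothetical device address, and a real handler would also validate the requested size and offset against the device region:

static phys_addr_t example_phys_base;  /* hypothetical device register base */

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;
        unsigned long pfn = (example_phys_base >> PAGE_SHIFT) + vma->vm_pgoff;

        return io_remap_pfn_range(vma, vma->vm_start, pfn, size,
                                  vma->vm_page_prot);
}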
This is in contrast to - * iov_iter_get_pages(), whose usages are transient. - * - * FIXME: For pages which are part of a filesystem, mappings are subject to the - * lifetime enforced by the filesystem and we need guarantees that longterm - * users like RDMA and V4L2 only establish mappings which coordinate usage with - * the filesystem. Ideas for this coordination include revoking the longterm - * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was - * added after the problem with filesystems was found FS DAX VMAs are - * specifically failed. Filesystem pages are still subject to bugs and use of - * FOLL_LONGTERM should be avoided on those pages. - * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * - * In the CMA case: long term pins in a CMA region would unnecessarily fragment - * that region. And so, CMA attempts to migrate the page before pinning, when - * FOLL_LONGTERM is specified. - * - * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, - * but an additional pin counting system) will be invoked. This is intended for - * anything that gets a page reference and then touches page data (for example, - * Direct IO). This lets the filesystem know that some non-file-system entity is - * potentially changing the pages' data. In contrast to FOLL_GET (whose pages - * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to unpin_user_page(). - * - * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different - * and separate refcounting mechanisms, however, and that means that each has - * its own acquire and release mechanisms: - * - * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. - * - * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. - * - * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. - * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based - * calls applied to them, and that's perfectly OK. This is a constraint on the - * callers, not on the pages.) - * - * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never - * directly by the caller. That's in order to help avoid mismatches when - * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via unpin_user_page(). - * - * Please see Documentation/core-api/pin_user_pages.rst for more information. +/* + * Convert errno to return value for ->page_mkwrite() calls. + * + * This should eventually be merged with vmf_error() above, but will need a + * careful audit of all vmf_error() callers. */ +static inline vm_fault_t vmf_fs_error(int err) +{ + if (err == 0) + return VM_FAULT_LOCKED; + if (err == -EFAULT || err == -EAGAIN) + return VM_FAULT_NOPAGE; + if (err == -ENOMEM) + return VM_FAULT_OOM; + /* -ENOSPC, -EDQUOT, -EIO ... */ + return VM_FAULT_SIGBUS; +} static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { @@ -2858,6 +3577,30 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. 
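vmf_fs_error() above folds the common errno-to-vm_fault_t mapping for ->page_mkwrite() style paths, including the err == 0 case that maps to VM_FAULT_LOCKED. A hedged sketch; example_prepare_write() stands in for a filesystem's own space-reservation helper and is not part of this diff:

static int example_prepare_write(struct folio *folio)
{
        /* hypothetical: reserve blocks/space for the folio about to be dirtied */
        return 0;
}

static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
        struct folio *folio = page_folio(vmf->page);
        int err;

        folio_lock(folio);
        err = example_prepare_write(folio);
        if (err)
                folio_unlock(folio);

        /* 0 -> VM_FAULT_LOCKED, -ENOMEM -> VM_FAULT_OOM, otherwise SIGBUS etc. */
        return vmf_fs_error(err);
}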
+ */ +static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, + unsigned int flags) +{ + /* + * If callers don't want to honor NUMA hinting faults, no need to + * determine if we would actually have to trigger a NUMA hinting fault. + */ + if (!(flags & FOLL_HONOR_NUMA_FAULT)) + return true; + + /* + * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. + * + * Requiring a fault here even for inaccessible VMAs would mean that + * FOLL_FORCE cannot make any progress, because handle_mm_fault() + * refuses to process NUMA hinting faults in inaccessible VMAs. + */ + return !vma_is_accessible(vma); +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); @@ -2866,43 +3609,56 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING -extern bool page_poisoning_enabled(void); -extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern void __kernel_poison_pages(struct page *page, int numpages); +extern void __kernel_unpoison_pages(struct page *page, int numpages); +extern bool _page_poisoning_enabled_early; +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); +static inline bool page_poisoning_enabled(void) +{ + return _page_poisoning_enabled_early; +} +/* + * For use in fast paths after init_mem_debugging() has run, or when a + * false negative result is not harmful when called too early. + */ +static inline bool page_poisoning_enabled_static(void) +{ + return static_branch_unlikely(&_page_poisoning_enabled); +} +static inline void kernel_poison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, numpages); +} +static inline void kernel_unpoison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_unpoison_pages(page, numpages); +} #else static inline bool page_poisoning_enabled(void) { return false; } -static inline void kernel_poison_pages(struct page *page, int numpages, - int enable) { } +static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } +static inline void kernel_poison_pages(struct page *page, int numpages) { } +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_alloc); -#else -DECLARE_STATIC_KEY_FALSE(init_on_alloc); -#endif +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { - if (static_branch_unlikely(&init_on_alloc) && - !page_poisoning_enabled()) + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) return true; return flags & __GFP_ZERO; } -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_free); -#else -DECLARE_STATIC_KEY_FALSE(init_on_free); -#endif +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { - return static_branch_unlikely(&init_on_free) && - !page_poisoning_enabled(); + return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free); } -#ifdef CONFIG_DEBUG_PAGEALLOC -extern void init_debug_pagealloc(void); -#else -static inline void init_debug_pagealloc(void) {} -#endif extern bool _debug_pagealloc_enabled_early; 
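init_on_alloc and init_on_free above switch to the DECLARE_STATIC_KEY_MAYBE()/static_branch_maybe() pattern, where the key's default state (and the likely/unlikely code layout) follows a Kconfig symbol. A hedged sketch of the same pattern with an illustrative key and config option; a real header would use DECLARE_STATIC_KEY_MAYBE() and define the key in a .c file:

#include <linux/jump_label.h>

/* Default state follows CONFIG_EXAMPLE_DEFAULT_ON (illustrative symbol). */
DEFINE_STATIC_KEY_MAYBE(CONFIG_EXAMPLE_DEFAULT_ON, example_feature);

static inline bool example_feature_enabled(void)
{
        /* Compiles to a patched jump/nop, not a load and test. */
        return static_branch_maybe(CONFIG_EXAMPLE_DEFAULT_ON, &example_feature);
}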
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); @@ -2913,8 +3669,8 @@ static inline bool debug_pagealloc_enabled(void) } /* - * For use in fast paths after init_debug_pagealloc() has run, or when a - * false negative result is not harmful when called too early. + * For use in fast paths after mem_debugging_and_hardening_init() has run, + * or when a false negative result is not harmful when called too early. */ static inline bool debug_pagealloc_enabled_static(void) { @@ -2924,28 +3680,74 @@ static inline bool debug_pagealloc_enabled_static(void) return static_branch_unlikely(&_debug_pagealloc_enabled); } -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) -extern void __kernel_map_pages(struct page *page, int numpages, int enable); - /* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() + * To support DEBUG_PAGEALLOC architecture must ensure that + * __kernel_map_pages() never fails */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) +extern void __kernel_map_pages(struct page *page, int numpages, int enable); +#ifdef CONFIG_DEBUG_PAGEALLOC +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); +} + +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} + +extern unsigned int _debug_guardpage_minorder; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool debug_guardpage_enabled(void) +{ + return static_branch_unlikely(&_debug_guardpage_enabled); +} + +static inline bool page_is_guard(struct page *page) +{ + if (!debug_guardpage_enabled()) + return false; + + return PageGuard(page); +} + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order) { - __kernel_map_pages(page, numpages, enable); + if (!debug_guardpage_enabled()) + return false; + return __set_page_guard(zone, page, order); } -#ifdef CONFIG_HIBERNATION -extern bool kernel_page_present(struct page *page); -#endif /* CONFIG_HIBERNATION */ -#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} -#ifdef CONFIG_HIBERNATION -static inline bool kernel_page_present(struct page *page) { return true; } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order) +{ + if (!debug_guardpage_enabled()) + return; + __clear_page_guard(zone, page, order); +} + +#else /* CONFIG_DEBUG_PAGEALLOC */ +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool debug_guardpage_enabled(void) { return false; } +static inline bool page_is_guard(struct page *page) { return false; } +static inline bool 
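debug_pagealloc_map_pages()/debug_pagealloc_unmap_pages() above only touch the kernel direct map when the static key is enabled, so callers need no enabled-check of their own. A hedged sketch of the intended free-path usage, combined with the kernel_poison_pages() wrapper from earlier in this diff; this is not a real allocator hook:

static void example_debug_free_pages(struct page *page, unsigned int order)
{
        /* Both helpers are no-ops unless page poisoning / debug_pagealloc is on. */
        kernel_poison_pages(page, 1 << order);
        debug_pagealloc_unmap_pages(page, 1 << order);
}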
set_page_guard(struct zone *zone, struct page *page, + unsigned int order) { return false; } +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order) {} +#endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); @@ -2965,14 +3767,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); -#ifdef CONFIG_SYSCTL -extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); -#endif - void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 @@ -2990,47 +3785,177 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); +unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap); + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, unsigned long ptpfn, + unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next); +int vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif -void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long nr_pages); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + if (altmap) + return altmap->reserve + altmap->free; + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} +#else +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ +} +#endif + +#define VMEMMAP_RESERVE_NR 2 +#ifdef 
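vmem_altmap_offset() above reports how many leading pfns of a device-backed range are consumed by the altmap reservation, i.e. the offset from the base pfn at which pfn_to_page() becomes valid. A hedged sketch using pgmap_altmap(), which returns NULL when no altmap is in use (the helper name below is illustrative):

static unsigned long example_first_mappable_pfn(struct dev_pagemap *pgmap,
                                                unsigned long base_pfn)
{
        /* vmem_altmap_offset(NULL) is 0, so this also covers the no-altmap case. */
        return base_pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
}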
CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long nr_pages; + unsigned long nr_vmemmap_pages; + + if (!pgmap || !is_power_of_2(sizeof(struct page))) + return false; + + nr_pages = pgmap_vmemmap_nr(pgmap); + nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); + /* + * For vmemmap optimization with DAX we need minimum 2 vmemmap + * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst + */ + return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); +} +/* + * If we don't have an architecture override, use the generic rule + */ +#ifndef vmemmap_can_optimize +#define vmemmap_can_optimize __vmemmap_can_optimize +#endif + +#else +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return false; +} +#endif enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, + MF_SW_SIMULATED = 1 << 5, + MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); -extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int sysctl_memory_failure_early_kill; -extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); +#ifdef CONFIG_MEMORY_FAILURE +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; +extern void memory_failure_queue(unsigned long pfn, int flags); +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); +void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_sub(unsigned long pfn, long i); +#else +static inline void memory_failure_queue(unsigned long pfn, int flags) +{ +} + +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) +{ + return 0; +} + +static inline void num_poisoned_pages_inc(unsigned long pfn) +{ +} + +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ +} +#endif + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +#else +static inline void memblk_nr_poison_inc(unsigned long pfn) +{ +} +static inline void memblk_nr_poison_sub(unsigned long pfn, long i) +{ +} +#endif + +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif /* * Error handlers for various types of pages. 
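The memory-failure interface above distinguishes the synchronous memory_failure() entry point from memory_failure_queue(), which defers handling to a workqueue and is what atomic notification paths are expected to use. A hedged sketch of a reporting path; the pfn, flags choice and context flag are illustrative:

static void example_report_poisoned_pfn(unsigned long pfn, bool in_atomic_ctx)
{
        if (in_atomic_ctx) {
                memory_failure_queue(pfn, MF_ACTION_REQUIRED);
                return;
        }

        if (memory_failure(pfn, MF_ACTION_REQUIRED | MF_MUST_KILL))
                pr_warn("example: recovery failed for pfn %#lx\n", pfn);
}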
@@ -3045,12 +3970,10 @@ enum mf_result { enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, - MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, - MF_MSG_NON_PMD_HUGE, + MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -3062,24 +3985,20 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr_hint, - unsigned int pages_per_huge_page); -extern void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr_hint, - struct vm_area_struct *vma, - unsigned int pages_per_huge_page); -extern long copy_huge_page_from_user(struct page *dst_page, - const void __user *usr_src, - unsigned int pages_per_huge_page, - bool allow_pagefault); +void folio_zero_user(struct folio *folio, unsigned long addr_hint); +int copy_user_large_folio(struct folio *dst, struct folio *src, + unsigned long addr_hint, + struct vm_area_struct *vma); +long copy_folio_from_user(struct folio *dst_folio, + const void __user *usr_src, + bool allow_pagefault); /** * vma_is_special_huge - Are transhuge page-table entries considered special? @@ -3099,33 +4018,6 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#ifdef CONFIG_DEBUG_PAGEALLOC -extern unsigned int _debug_guardpage_minorder; -DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static inline unsigned int debug_guardpage_minorder(void) -{ - return _debug_guardpage_minorder; -} - -static inline bool debug_guardpage_enabled(void) -{ - return static_branch_unlikely(&_debug_guardpage_enabled); -} - -static inline bool page_is_guard(struct page *page) -{ - if (!debug_guardpage_enabled()) - return false; - - return PageGuard(page); -} -#else -static inline unsigned int debug_guardpage_minorder(void) { return 0; } -static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else @@ -3151,7 +4043,145 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif -extern int sysctl_nr_trim_pages; +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, + struct anon_vma_name *anon_name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) { + return 0; +} +#endif + +#ifdef CONFIG_UNACCEPTED_MEMORY + +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); + +#else + +static inline bool range_contains_unaccepted_memory(phys_addr_t start, + unsigned long size) +{ + return false; +} + +static inline void accept_memory(phys_addr_t start, unsigned long size) +{ +} + +#endif + +static inline bool pfn_is_unaccepted_memory(unsigned long pfn) +{ + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); +} + +void vma_pgtable_walk_begin(struct vm_area_struct *vma); +void vma_pgtable_walk_end(struct vm_area_struct *vma); + +int 
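folio_zero_user() above replaces clear_huge_page() and takes the faulting address as a hint so that the subpage the user actually touched is cleared last and stays cache-hot. A hedged sketch of a fault path that has just allocated a large anonymous folio; the function name is illustrative:

static void example_prepare_new_user_folio(struct folio *folio,
                                           unsigned long fault_addr)
{
        /* addr_hint steers the order in which the subpages are cleared. */
        folio_zero_user(folio, fault_addr);
        __folio_mark_uptodate(folio);
}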
reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +int reserve_mem_release_by_name(const char *name); + +#ifdef CONFIG_64BIT +int do_mseal(unsigned long start, size_t len_in, unsigned long flags); +#else +static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + /* noop on 32 bit */ + return 0; +} +#endif + +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + + +/* + * mseal of userspace process's system mappings. + */ +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED +#else +#define VM_SEALED_SYSMAP VM_NONE +#endif + +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. 
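user_alloc_needs_zeroing() above lets a fault path skip its explicit clearing pass when init_on_alloc already zeroed the folio and no cache aliasing is involved. A hedged sketch of that pattern; the helper name is illustrative:

static void example_clear_user_folio(struct folio *folio, unsigned long addr)
{
        long i;

        if (!user_alloc_needs_zeroing())
                return; /* allocator already zeroed it and caches are coherent */

        for (i = 0; i < folio_nr_pages(folio); i++)
                clear_user_highpage(folio_page(folio, i),
                                    addr + i * PAGE_SIZE);
}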
page_is_pfmemalloc() is checked in
+ * __page_pool_put_page() to avoid recycling the pfmemalloc page.
+ */
+#define PP_MAGIC_MASK	~(PP_DMA_INDEX_MASK | 0x3UL)
+
+#ifdef CONFIG_PAGE_POOL
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+	return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+#else
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+	return false;
+}
+#endif
-#endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */