aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/hugetlb.h16
-rw-r--r--include/asm-generic/memory_model.h12
-rw-r--r--include/asm-generic/page.h2
-rw-r--r--include/linux/blkdev.h12
-rw-r--r--include/linux/buffer_head.h5
-rw-r--r--include/linux/damon.h68
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/gfp_types.h12
-rw-r--r--include/linux/highmem.h72
-rw-r--r--include/linux/huge_mm.h13
-rw-r--r--include/linux/hugetlb.h109
-rw-r--r--include/linux/hugetlb_cgroup.h8
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/linux/kasan.h36
-rw-r--r--include/linux/maple_tree.h19
-rw-r--r--include/linux/memcontrol.h66
-rw-r--r--include/linux/migrate.h30
-rw-r--r--include/linux/mm.h542
-rw-r--r--include/linux/mm_inline.h19
-rw-r--r--include/linux/mm_types.h186
-rw-r--r--include/linux/mman.h34
-rw-r--r--include/linux/mmu_notifier.h13
-rw-r--r--include/linux/mmzone.h150
-rw-r--r--include/linux/page-flags.h1
-rw-r--r--include/linux/page_ext.h20
-rw-r--r--include/linux/pagemap.h34
-rw-r--r--include/linux/pagevec.h13
-rw-r--r--include/linux/pagewalk.h11
-rw-r--r--include/linux/pgtable.h34
-rw-r--r--include/linux/pid_namespace.h19
-rw-r--r--include/linux/rmap.h13
-rw-r--r--include/linux/sched/coredump.h6
-rw-r--r--include/linux/shmem_fs.h18
-rw-r--r--include/linux/slab_def.h2
-rw-r--r--include/linux/slub_def.h2
-rw-r--r--include/linux/stackdepot.h152
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h4
-rw-r--r--include/linux/swapops.h6
-rw-r--r--include/linux/userfaultfd_k.h2
-rw-r--r--include/linux/vmalloc.h1
-rw-r--r--include/linux/writeback.h16
-rw-r--r--include/linux/xarray.h3
-rw-r--r--include/trace/events/cma.h32
-rw-r--r--include/trace/events/mmflags.h1
-rw-r--r--include/uapi/linux/fcntl.h1
-rw-r--r--include/uapi/linux/memfd.h4
-rw-r--r--include/uapi/linux/prctl.h6
48 files changed, 1197 insertions, 633 deletions
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index a57d667addd2..d7f6335d3999 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -25,6 +25,13 @@ static inline pte_t huge_pte_mkwrite(pte_t pte)
return pte_mkwrite(pte);
}
+#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
+static inline pte_t huge_pte_wrprotect(pte_t pte)
+{
+ return pte_wrprotect(pte);
+}
+#endif
+
static inline pte_t huge_pte_mkdirty(pte_t pte)
{
return pte_mkdirty(pte);
@@ -37,7 +44,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
- return pte_mkuffd_wp(pte);
+ return huge_pte_wrprotect(pte_mkuffd_wp(pte));
}
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
@@ -104,13 +111,6 @@ static inline int huge_pte_none_mostly(pte_t pte)
return huge_pte_none(pte) || is_pte_marker(pte);
}
-#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-#endif
-
#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index a2c8ed60233a..6796abe1900e 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -19,6 +19,18 @@
#define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \
ARCH_PFN_OFFSET)
+#ifndef pfn_valid
+static inline int pfn_valid(unsigned long pfn)
+{
+ /* avoid <linux/mm.h> include hell */
+ extern unsigned long max_mapnr;
+ unsigned long pfn_offset = ARCH_PFN_OFFSET;
+
+ return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
+}
+#define pfn_valid pfn_valid
+#endif
+
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
/* memmap is virtually contiguous. */
diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h
index 6fc47561814c..c0be2edeb484 100644
--- a/include/asm-generic/page.h
+++ b/include/asm-generic/page.h
@@ -84,8 +84,6 @@ extern unsigned long memory_end;
#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
#endif
-#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr)
-
#define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \
((void *)(kaddr) < (void *)memory_end))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b9637d63e6f0..41a41561b773 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -556,6 +556,7 @@ struct request_queue {
#define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */
#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */
#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */
+#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */
#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */
#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */
#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */
@@ -1253,6 +1254,12 @@ static inline bool bdev_nonrot(struct block_device *bdev)
return blk_queue_nonrot(bdev_get_queue(bdev));
}
+static inline bool bdev_synchronous(struct block_device *bdev)
+{
+ return test_bit(QUEUE_FLAG_SYNCHRONOUS,
+ &bdev_get_queue(bdev)->queue_flags);
+}
+
static inline bool bdev_stable_writes(struct block_device *bdev)
{
return test_bit(QUEUE_FLAG_STABLE_WRITES,
@@ -1397,7 +1404,6 @@ struct block_device_operations {
unsigned int flags);
int (*open) (struct block_device *, fmode_t);
void (*release) (struct gendisk *, fmode_t);
- int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
unsigned int (*check_events) (struct gendisk *disk,
@@ -1432,10 +1438,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t,
#define blkdev_compat_ptr_ioctl NULL
#endif
-extern int bdev_read_page(struct block_device *, sector_t, struct page *);
-extern int bdev_write_page(struct block_device *, sector_t, struct page *,
- struct writeback_control *);
-
static inline void blk_wake_io_task(struct task_struct *waiter)
{
/*
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 33fa5e94aa80..8f14dca5fed7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -61,7 +61,10 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
struct buffer_head {
unsigned long b_state; /* buffer state bitmap (see above) */
struct buffer_head *b_this_page;/* circular list of page's buffers */
- struct page *b_page; /* the page this bh is mapped to */
+ union {
+ struct page *b_page; /* the page this bh is mapped to */
+ struct folio *b_folio; /* the folio this bh is mapped to */
+ };
sector_t b_blocknr; /* start block number */
size_t b_size; /* size of mapping */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index ad15a5b88e3a..d5d4d19928e0 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -8,6 +8,7 @@
#ifndef _DAMON_H_
#define _DAMON_H_
+#include <linux/memcontrol.h>
#include <linux/mutex.h>
#include <linux/time64.h>
#include <linux/types.h>
@@ -90,6 +91,12 @@ struct damon_target {
* @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists.
* @DAMOS_STAT: Do nothing but count the stat.
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions
+ *
+ * The support of each action is up to running &struct damon_operations.
+ * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR support all actions except
+ * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR
+ * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum
+ * DAMOS_LRU_DEPRIO, and &DAMOS_STAT.
*/
enum damos_action {
DAMOS_WILLNEED,
@@ -216,6 +223,44 @@ struct damos_stat {
};
/**
+ * enum damos_filter_type - Type of memory for &struct damos_filter
+ * @DAMOS_FILTER_TYPE_ANON: Anonymous pages.
+ * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages.
+ * @NR_DAMOS_FILTER_TYPES: Number of filter types.
+ *
+ * The support of each filter type is up to running &struct damon_operations.
+ * &enum DAMON_OPS_PADDR supports all filter types, while
+ * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR do not support any
+ * filter types.
+ */
+enum damos_filter_type {
+ DAMOS_FILTER_TYPE_ANON,
+ DAMOS_FILTER_TYPE_MEMCG,
+ NR_DAMOS_FILTER_TYPES,
+};
+
+/**
+ * struct damos_filter - DAMOS action target memory filter.
+ * @type: Type of the page.
+ * @matching: If the matching page should be filtered out or in.
+ * @memcg_id: Memcg id in question if @type is DAMOS_FILTER_TYPE_MEMCG.
+ * @list: List head for siblings.
+ *
+ * Before applying the &damos->action to a memory region, DAMOS checks if each
+ * page of the region matches this filter and avoids applying the action if so.
+ * Note that the check support is up to &struct damon_operations
+ * implementation.
+ */
+struct damos_filter {
+ enum damos_filter_type type;
+ bool matching;
+ union {
+ unsigned short memcg_id;
+ };
+ struct list_head list;
+};
+
+/**
* struct damos_access_pattern - Target access pattern of the given scheme.
* @min_sz_region: Minimum size of target regions.
* @max_sz_region: Maximum size of target regions.
@@ -239,6 +284,7 @@ struct damos_access_pattern {
* @action: &damo_action to be applied to the target regions.
* @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme.
+ * @filters: Additional set of &struct damos_filter for &action.
* @stat: Statistics of this scheme.
* @list: List head for siblings.
*
@@ -254,6 +300,10 @@ struct damos_access_pattern {
* If all schemes that registered to a &struct damon_ctx are inactive, DAMON
* stops monitoring and just repeatedly checks the watermarks.
*
+ * Before applying the &action to a memory region, &struct damon_operations
+ * implementation could check pages of the region and skip &action to respect
+ * &filters.
+ *
* After applying the &action to each region, &stat_count and &stat_sz is
* updated to reflect the number of regions and total size of regions that the
* &action is applied.
@@ -263,6 +313,7 @@ struct damos {
enum damos_action action;
struct damos_quota quota;
struct damos_watermarks wmarks;
+ struct list_head filters;
struct damos_stat stat;
struct list_head list;
};
@@ -303,10 +354,10 @@ struct damon_ctx;
* users should register the low level operations for their target address
* space and usecase via the &damon_ctx.ops. Then, the monitoring thread
* (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting
- * the monitoring, @update after each &damon_ctx.ops_update_interval, and
+ * the monitoring, @update after each &damon_attrs.ops_update_interval, and
* @check_accesses, @target_valid and @prepare_access_checks after each
- * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each
- * &damon_ctx.aggr_interval.
+ * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after
+ * each &damon_attrs.aggr_interval.
*
* Each &struct damon_operations instance having valid @id can be registered
* via damon_register_ops() and selected by damon_select_ops() later.
@@ -516,6 +567,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#define damon_for_each_scheme_safe(s, next, ctx) \
list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
+#define damos_for_each_filter(f, scheme) \
+ list_for_each_entry(f, &(scheme)->filters, list)
+
+#define damos_for_each_filter_safe(f, next, scheme) \
+ list_for_each_entry_safe(f, next, &(scheme)->filters, list)
+
#ifdef CONFIG_DAMON
struct damon_region *damon_new_region(unsigned long start, unsigned long end);
@@ -536,6 +593,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
unsigned int nr_ranges);
+struct damos_filter *damos_new_filter(enum damos_filter_type type,
+ bool matching);
+void damos_add_filter(struct damos *s, struct damos_filter *f);
+void damos_destroy_filter(struct damos_filter *f);
+
struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
enum damos_action action, struct damos_quota *quota,
struct damos_watermarks *wmarks);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d46ae1e525fc..c85916e9f7db 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports DIRECT IO */
#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
+#define FMODE_NOREUSE ((__force fmode_t)0x800000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index d88c46ca82e1..5088637fe5c2 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t;
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_ZERO 0x100u
-#define ___GFP_ATOMIC 0x200u
+/* 0x200u unused */
#define ___GFP_DIRECT_RECLAIM 0x400u
#define ___GFP_KSWAPD_RECLAIM 0x800u
#define ___GFP_WRITE 0x1000u
@@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t;
*
* %__GFP_HIGH indicates that the caller is high-priority and that granting
* the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
+ * For example creating an IO context to clean pages and requests
+ * from atomic context.
*
* %__GFP_MEMALLOC allows access to all memory. This should only be used when
* the caller guarantees the allocation will allow more memory to be freed
@@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t;
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
*/
-#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
@@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t;
* version does not attempt reclaim/compaction at all and is by default used
* in page fault path, while the non-light is used by khugepaged.
*/
-#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 44242268f53b..b06254e76d99 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -86,8 +86,8 @@ static inline void kmap_flush_unused(void);
* virtual address of the direct mapping. Only real highmem pages are
* temporarily mapped.
*
- * While it is significantly faster than kmap() for the higmem case it
- * comes with restrictions about the pointer validity.
+ * While kmap_local_page() is significantly faster than kmap() for the highmem
+ * case it comes with restrictions about the pointer validity.
*
* On HIGHMEM enabled systems mapping a highmem page has the side effect of
* disabling migration in order to keep the virtual address stable across
@@ -119,9 +119,8 @@ static inline void *kmap_local_page(struct page *page);
* virtual address of the direct mapping. Only real highmem pages are
* temporarily mapped.
*
- * While it is significantly faster than kmap() for the higmem case it
- * comes with restrictions about the pointer validity. Only use when really
- * necessary.
+ * While it is significantly faster than kmap() for the highmem case it
+ * comes with restrictions about the pointer validity.
*
* On HIGHMEM enabled systems mapping a highmem page has the side effect of
* disabling migration in order to keep the virtual address stable across
@@ -208,31 +207,30 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
}
#endif
-#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
+#ifndef vma_alloc_zeroed_movable_folio
/**
- * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
- * @vma: The VMA the page is to be allocated for
- * @vaddr: The virtual address the page will be inserted into
+ * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA.
+ * @vma: The VMA the page is to be allocated for.
+ * @vaddr: The virtual address the page will be inserted into.
*
- * Returns: The allocated and zeroed HIGHMEM page
+ * This function will allocate a page suitable for inserting into this
+ * VMA at this virtual address. It may be allocated from highmem or
+ * the movable zone. An architecture may provide its own implementation.
*
- * This function will allocate a page for a VMA that the caller knows will
- * be able to migrate in the future using move_pages() or reclaimed
- *
- * An architecture may override this function by defining
- * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own
- * implementation.
+ * Return: A folio containing one allocated and zeroed page or NULL if
+ * we are out of memory.
*/
-static inline struct page *
-alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+static inline
+struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
unsigned long vaddr)
{
- struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ struct folio *folio;
- if (page)
- clear_user_highpage(page, vaddr);
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false);
+ if (folio)
+ clear_user_highpage(&folio->page, vaddr);
- return page;
+ return folio;
}
#endif
@@ -416,6 +414,36 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
}
/**
+ * memcpy_from_file_folio - Copy some bytes from a file folio.
+ * @to: The destination buffer.
+ * @folio: The folio to copy from.
+ * @pos: The position in the file.
+ * @len: The maximum number of bytes to copy.
+ *
+ * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE
+ * if the folio comes from HIGHMEM, and by the size of the folio.
+ *
+ * Return: The number of bytes copied from the folio.
+ */
+static inline size_t memcpy_from_file_folio(char *to, struct folio *folio,
+ loff_t pos, size_t len)
+{
+ size_t offset = offset_in_folio(folio, pos);
+ char *from = kmap_local_folio(folio, offset);
+
+ if (folio_test_highmem(folio)) {
+ offset = offset_in_page(offset);
+ len = min_t(size_t, len, PAGE_SIZE - offset);
+ } else
+ len = min(len, folio_size(folio) - offset);
+
+ memcpy(to, from, len);
+ kunmap_local(from);
+
+ return len;
+}
+
+/**
* folio_zero_segments() - Zero two byte ranges in a folio.
* @folio: The folio to write to.
* @start1: The first byte to zero.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a1341fdcf666..70bd867eba94 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -187,7 +187,7 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list(page, NULL);
}
-void deferred_split_huge_page(struct page *page);
+void deferred_split_folio(struct folio *folio);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio);
@@ -293,15 +293,6 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}
-static inline struct list_head *page_deferred_list(struct page *page)
-{
- /*
- * See organization of tail pages of compound page in
- * "struct page" definition.
- */
- return &page[2].deferred_list;
-}
-
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -349,7 +340,7 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-static inline void deferred_split_huge_page(struct page *page) {}
+static inline void deferred_split_folio(struct folio *folio) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 9ab9d3105d5c..7c977d234aba 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H
+#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
@@ -170,11 +171,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
-int isolate_hugetlb(struct page *page, struct list_head *list);
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison);
+bool isolate_hugetlb(struct folio *folio, struct list_head *list);
+int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
-void putback_active_hugepage(struct page *page);
+void folio_putback_active_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -193,6 +194,43 @@ extern struct list_head huge_boot_pages;
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz);
+/*
+ * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
+ * Returns the pte_t* if found, or NULL if the address is not mapped.
+ *
+ * IMPORTANT: we should normally not call this function directly; it is
+ * only a common interface to implement arch-specific walkers.
+ * Please use hugetlb_walk() instead, because that will attempt to
+ * verify the locking for you.
+ *
+ * Since this function will walk all the pgtable pages (including not only
+ * high-level pgtable page, but also PUD entry that can be unshared
+ * concurrently for VM_SHARED), the caller of this function should be
+ * responsible for its thread safety. One can follow this rule:
+ *
+ * (1) For private mappings: pmd unsharing is not possible, so holding the
+ * mmap_lock for either read or write is sufficient. Most callers
+ * already hold the mmap_lock, so normally, no special action is
+ * required.
+ *
+ * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged
+ * pgtable page can go away from under us! It can be done by a pmd
+ * unshare with a follow up munmap() on the other process), then we
+ * need either:
+ *
+ * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare
+ * won't happen upon the range (it also makes sure the pte_t we
+ * read is the right and stable one), or,
+ *
+ * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make
+ * sure even if unshare happened the racy unmap() will wait until
+ * i_mmap_rwsem is released.
+ *
+ * Option (2.1) is the safest, which guarantees pte stability from pmd
+ * sharing pov, until the vma lock is released. Option (2.2) doesn't protect
+ * against a concurrent pmd unshare, but it makes sure the pgtable page is
+ * safe to access.
+ */
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
@@ -211,7 +249,7 @@ void hugetlb_vma_lock_release(struct kref *kref);
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
@@ -375,12 +413,12 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
return NULL;
}
-static inline int isolate_hugetlb(struct page *page, struct list_head *list)
+static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
- return -EBUSY;
+ return false;
}
-static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
+static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
return 0;
}
@@ -391,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return 0;
}
-static inline void putback_active_hugepage(struct page *page)
+static inline void folio_putback_active_hugetlb(struct folio *folio)
{
}
@@ -400,7 +438,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline unsigned long hugetlb_change_protection(
+static inline long hugetlb_change_protection(
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
@@ -679,16 +717,16 @@ struct huge_bootmem_page {
};
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
-struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
-struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask);
-struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
-int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address, struct page *page);
+ unsigned long address, struct folio *folio);
/* arch callback */
int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
@@ -843,9 +881,9 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
#ifdef CONFIG_MEMORY_FAILURE
-extern void hugetlb_clear_page_hwpoison(struct page *hpage);
+extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
#else
-static inline void hugetlb_clear_page_hwpoison(struct page *hpage)
+static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
}
#endif
@@ -998,21 +1036,21 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
return -ENOMEM;
}
-static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
+static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr,
int avoid_reserve)
{
return NULL;
}
-static inline struct page *
-alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+static inline struct folio *
+alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
return NULL;
}
-static inline struct page *alloc_huge_page_vma(struct hstate *h,
+static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address)
{
@@ -1213,4 +1251,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
#endif
+static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
+}
+
+/*
+ * Safe version of huge_pte_offset() to check the locks. See comments
+ * above huge_pte_offset().
+ */
+static inline pte_t *
+hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
+{
+#if defined(CONFIG_HUGETLB_PAGE) && \
+ defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ /*
+ * If pmd sharing is possible, locking is needed to safely walk the
+ * hugetlb pgtables. More information can be found at the comment
+ * above huge_pte_offset() in the same file.
+ *
+ * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
+ */
+ if (__vma_shareable_lock(vma))
+ WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
+ !lockdep_is_held(
+ &vma->vm_file->f_mapping->i_mmap_rwsem));
+#endif
+ return huge_pte_offset(vma->vm_mm, addr, sz);
+}
+
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index f706626a8063..3d82d91f49ac 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr);
extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page);
+ struct folio *folio);
extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page);
+ struct folio *folio);
extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
struct folio *folio);
extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
@@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx,
static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
}
static inline void
hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 2170e0cc279d..5962072a4b19 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1570,8 +1570,6 @@ extern int jbd2_journal_inode_ranged_write(handle_t *handle,
extern int jbd2_journal_inode_ranged_wait(handle_t *handle,
struct jbd2_inode *inode, loff_t start_byte,
loff_t length);
-extern int jbd2_journal_submit_inode_data_buffers(
- struct jbd2_inode *jinode);
extern int jbd2_journal_finish_inode_data_buffers(
struct jbd2_inode *jinode);
extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 96c9d56e5510..f7ef70661ce2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -96,15 +96,6 @@ static inline bool kasan_has_integrated_init(void)
}
#ifdef CONFIG_KASAN
-
-struct kasan_cache {
-#ifdef CONFIG_KASAN_GENERIC
- int alloc_meta_offset;
- int free_meta_offset;
-#endif
- bool is_kmalloc;
-};
-
void __kasan_unpoison_range(const void *addr, size_t size);
static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
{
@@ -120,19 +111,13 @@ static __always_inline void kasan_poison_pages(struct page *page,
__kasan_poison_pages(page, order, init);
}
-void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init);
-static __always_inline void kasan_unpoison_pages(struct page *page,
+bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init);
+static __always_inline bool kasan_unpoison_pages(struct page *page,
unsigned int order, bool init)
{
if (kasan_enabled())
- __kasan_unpoison_pages(page, order, init);
-}
-
-void __kasan_cache_create_kmalloc(struct kmem_cache *cache);
-static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
-{
- if (kasan_enabled())
- __kasan_cache_create_kmalloc(cache);
+ return __kasan_unpoison_pages(page, order, init);
+ return false;
}
void __kasan_poison_slab(struct slab *slab);
@@ -249,9 +234,11 @@ static __always_inline bool kasan_check_byte(const void *addr)
static inline void kasan_unpoison_range(const void *address, size_t size) {}
static inline void kasan_poison_pages(struct page *page, unsigned int order,
bool init) {}
-static inline void kasan_unpoison_pages(struct page *page, unsigned int order,
- bool init) {}
-static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
+static inline bool kasan_unpoison_pages(struct page *page, unsigned int order,
+ bool init)
+{
+ return false;
+}
static inline void kasan_poison_slab(struct slab *slab) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
void *object) {}
@@ -302,6 +289,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
#ifdef CONFIG_KASAN_GENERIC
+struct kasan_cache {
+ int alloc_meta_offset;
+ int free_meta_offset;
+};
+
size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object);
slab_flags_t kasan_never_merge(void);
void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index e594db58a0f1..1fadb5f5978b 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -12,7 +12,6 @@
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
/* #define CONFIG_MAPLE_RCU_DISABLED */
-/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */
/*
* Allocated nodes are mutable until they have been inserted into the tree,
@@ -433,6 +432,7 @@ struct ma_wr_state {
.min = 0, \
.max = ULONG_MAX, \
.alloc = NULL, \
+ .mas_flags = 0, \
}
#define MA_WR_STATE(name, ma_state, wr_entry) \
@@ -456,7 +456,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
void mas_store_prealloc(struct ma_state *mas, void *entry);
void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
-int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
+int mas_preallocate(struct ma_state *mas, gfp_t gfp);
bool mas_is_err(struct ma_state *mas);
bool mas_nomem(struct ma_state *mas, gfp_t gfp);
@@ -471,6 +471,16 @@ void *mas_next(struct ma_state *mas, unsigned long max);
int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
unsigned long size);
+static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
+ unsigned long addr)
+{
+ memset(mas, 0, sizeof(struct ma_state));
+ mas->tree = tree;
+ mas->index = mas->last = addr;
+ mas->max = ULONG_MAX;
+ mas->node = MAS_START;
+}
+
/* Checks if a mas has not found anything */
static inline bool mas_is_none(struct ma_state *mas)
{
@@ -483,9 +493,6 @@ static inline bool mas_is_paused(struct ma_state *mas)
return mas->node == MAS_PAUSE;
}
-void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas);
-void mas_dup_store(struct ma_state *mas, void *entry);
-
/*
* This finds an empty area from the highest address to the lowest.
* AKA "Topdown" version,
@@ -517,7 +524,6 @@ static inline void mas_reset(struct ma_state *mas)
* entry.
*
* Note: may return the zero entry.
- *
*/
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
@@ -639,7 +645,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt)
}
static inline unsigned int mt_height(const struct maple_tree *mt)
-
{
return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET;
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1e38e99998c7..b6eda2ab205d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -466,34 +466,34 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
}
/*
- * page_memcg_check - get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+ * folio_memcg_check - Get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
*
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function unlike page_memcg() can take any page
- * as an argument. It has to be used in cases when it's not known if a page
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function unlike folio_memcg() can take any folio
+ * as an argument. It has to be used in cases when it's not known if a folio
* has an associated memory cgroup pointer or an object cgroups vector or
* an object cgroup.
*
- * For a non-kmem page any of the following ensures page and memcg binding
+ * For a non-kmem folio any of the following ensures folio and memcg binding
* stability:
*
- * - the page lock
+ * - the folio lock
* - LRU isolation
- * - lock_page_memcg()
+ * - lock_folio_memcg()
* - exclusive reference
* - mem_cgroup_trylock_pages()
*
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
+ * For a kmem folio a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem folio from being released.
*/
-static inline struct mem_cgroup *page_memcg_check(struct page *page)
+static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
/*
- * Because page->memcg_data might be changed asynchronously
- * for slab pages, READ_ONCE() should be used here.
+ * Because folio->memcg_data might be changed asynchronously
+ * for slabs, READ_ONCE() should be used here.
*/
- unsigned long memcg_data = READ_ONCE(page->memcg_data);
+ unsigned long memcg_data = READ_ONCE(folio->memcg_data);
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
@@ -508,6 +508,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
+static inline struct mem_cgroup *page_memcg_check(struct page *page)
+{
+ if (PageTail(page))
+ return NULL;
+ return folio_memcg_check((struct folio *)page);
+}
+
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
struct mem_cgroup *memcg;
@@ -794,6 +801,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -878,7 +890,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return match;
}
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -1165,6 +1177,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
return NULL;
}
+static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
return NULL;
@@ -1301,6 +1318,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
@@ -1760,24 +1782,24 @@ static inline bool memcg_bpf_enabled(void)
return static_branch_likely(&memcg_bpf_enabled_key);
}
-extern struct static_key_false memcg_kmem_enabled_key;
+extern struct static_key_false memcg_kmem_online_key;
-static inline bool memcg_kmem_enabled(void)
+static inline bool memcg_kmem_online(void)
{
- return static_branch_likely(&memcg_kmem_enabled_key);
+ return static_branch_likely(&memcg_kmem_online_key);
}
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
int order)
{
- if (memcg_kmem_enabled())
+ if (memcg_kmem_online())
return __memcg_kmem_charge_page(page, gfp, order);
return 0;
}
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
- if (memcg_kmem_enabled())
+ if (memcg_kmem_online())
__memcg_kmem_uncharge_page(page, order);
}
@@ -1798,7 +1820,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
{
struct mem_cgroup *memcg;
- if (!memcg_kmem_enabled())
+ if (!memcg_kmem_online())
return;
rcu_read_lock();
@@ -1843,7 +1865,7 @@ static inline bool memcg_bpf_enabled(void)
return false;
}
-static inline bool memcg_kmem_enabled(void)
+static inline bool memcg_kmem_online(void)
{
return false;
}
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3ef77f52a4f0..6241a1596a75 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -18,6 +18,7 @@ struct migration_target_control;
* - zero on page migration success;
*/
#define MIGRATEPAGE_SUCCESS 0
+#define MIGRATEPAGE_UNMAP 1
/**
* struct movable_operations - Driver page migration
@@ -61,16 +62,16 @@ extern const char *migrate_reason_names[MR_TYPES];
#ifdef CONFIG_MIGRATION
-extern void putback_movable_pages(struct list_head *l);
+void putback_movable_pages(struct list_head *l);
int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode, int extra_count);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode);
-extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
- unsigned long private, enum migrate_mode mode, int reason,
- unsigned int *ret_succeeded);
-extern struct page *alloc_migration_target(struct page *page, unsigned long private);
-extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
+int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
+ unsigned long private, enum migrate_mode mode, int reason,
+ unsigned int *ret_succeeded);
+struct page *alloc_migration_target(struct page *page, unsigned long private);
+bool isolate_movable_page(struct page *page, isolate_mode_t mode);
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src);
@@ -91,8 +92,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new,
static inline struct page *alloc_migration_target(struct page *page,
unsigned long private)
{ return NULL; }
-static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
- { return -EBUSY; }
+static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+ { return false; }
static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
@@ -123,6 +124,15 @@ static inline bool folio_test_movable(struct folio *folio)
}
static inline
+const struct movable_operations *folio_movable_ops(struct folio *folio)
+{
+ VM_BUG_ON(!__folio_test_movable(folio));
+
+ return (const struct movable_operations *)
+ ((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE);
+}
+
+static inline
const struct movable_operations *page_movable_ops(struct page *page)
{
VM_BUG_ON(!__PageMovable(page));
@@ -132,8 +142,8 @@ const struct movable_operations *page_movable_ops(struct page *page)
}
#ifdef CONFIG_NUMA_BALANCING
-extern int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node);
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+ int node);
#else
static inline int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 716d30d93616..1f79667824eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -282,7 +282,12 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#ifdef CONFIG_MMU
#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
+#else /* CONFIG_MMU */
+#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+#define VM_UFFD_MISSING 0
+#endif /* CONFIG_MMU */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
@@ -422,8 +427,8 @@ extern unsigned int kobjsize(const void *objp);
/* This mask defines which mm->def_flags a process can inherit its parent */
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
-/* This mask is used to clear all the VMA flags used by mlock */
-#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
@@ -628,6 +633,63 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
INIT_LIST_HEAD(&vma->anon_vma_chain);
}
+/* Use when VMA is not part of the VMA tree and needs no locking */
+static inline void vm_flags_init(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ ACCESS_PRIVATE(vma, __vm_flags) = flags;
+}
+
+/* Use when VMA is part of the VMA tree and modifications need coordination */
+static inline void vm_flags_reset(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ vm_flags_init(vma, flags);
+}
+
+static inline void vm_flags_reset_once(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
+}
+
+static inline void vm_flags_set(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ ACCESS_PRIVATE(vma, __vm_flags) |= flags;
+}
+
+static inline void vm_flags_clear(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
+}
+
+/*
+ * Use only if VMA is not part of the VMA tree or has no other users and
+ * therefore needs no locking.
+ */
+static inline void __vm_flags_mod(struct vm_area_struct *vma,
+ vm_flags_t set, vm_flags_t clear)
+{
+ vm_flags_init(vma, (vma->vm_flags | set) & ~clear);
+}
+
+/*
+ * Use only when the order of set/clear operations is unimportant, otherwise
+ * use vm_flags_{set|clear} explicitly.
+ */
+static inline void vm_flags_mod(struct vm_area_struct *vma,
+ vm_flags_t set, vm_flags_t clear)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ __vm_flags_mod(vma, set, clear);
+}
+
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
@@ -671,16 +733,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
- return mas_find(&vmi->mas, max);
+ return mas_find(&vmi->mas, max - 1);
}
static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
/*
- * Uses vma_find() to get the first VMA when the iterator starts.
+ * Uses mas_find() to get the first VMA when the iterator starts.
* Calling mas_next() could skip the first entry.
*/
- return vma_find(vmi, ULONG_MAX);
+ return mas_find(&vmi->mas, ULONG_MAX);
}
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
@@ -693,12 +755,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
return vmi->mas.index;
}
+static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
+{
+ return vmi->mas.last + 1;
+}
+static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
+ unsigned long count)
+{
+ return mas_expected_entries(&vmi->mas, count);
+}
+
+/* Free any unused preallocations */
+static inline void vma_iter_free(struct vma_iterator *vmi)
+{
+ mas_destroy(&vmi->mas);
+}
+
+static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+ vmi->mas.index = vma->vm_start;
+ vmi->mas.last = vma->vm_end - 1;
+ mas_store(&vmi->mas, vma);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline void vma_iter_invalidate(struct vma_iterator *vmi)
+{
+ mas_pause(&vmi->mas);
+}
+
+static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
+{
+ mas_set(&vmi->mas, addr);
+}
+
#define for_each_vma(__vmi, __vma) \
while (((__vma) = vma_next(&(__vmi))) != NULL)
/* The MM code likes to work with exclusive end addresses */
#define for_each_vma_range(__vmi, __vma, __end) \
- while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL)
+ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
#ifdef CONFIG_SHMEM
/*
@@ -720,11 +820,20 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
struct mmu_gather;
struct inode;
+/*
+ * compound_order() can be called without holding a reference, which means
+ * that niceties like page_folio() don't work. These callers should be
+ * prepared to handle wild return values. For example, PG_head may be
+ * set before _folio_order is initialised, or this may be a tail page.
+ * See compaction.c for some good examples.
+ */
static inline unsigned int compound_order(struct page *page)
{
- if (!PageHead(page))
+ struct folio *folio = (struct folio *)page;
+
+ if (!test_bit(PG_head, &folio->flags))
return 0;
- return page[1].compound_order;
+ return folio->_folio_order;
}
/**
@@ -783,6 +892,13 @@ static inline bool get_page_unless_zero(struct page *page)
return page_ref_add_unless(page, 1, 0);
}
+static inline struct folio *folio_get_nontail_page(struct page *page)
+{
+ if (unlikely(!get_page_unless_zero(page)))
+ return NULL;
+ return (struct folio *)page;
+}
+
extern int page_is_ram(unsigned long pfn);
enum {
@@ -832,34 +948,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
static inline int folio_entire_mapcount(struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- return atomic_read(folio_mapcount_ptr(folio)) + 1;
-}
-
-/*
- * Mapcount of compound page as a whole, does not include mapped sub-pages.
- * Must be called only on head of compound page.
- */
-static inline int head_compound_mapcount(struct page *head)
-{
- return atomic_read(compound_mapcount_ptr(head)) + 1;
-}
-
-/*
- * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages,
- * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit
- * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
- * leaves subpages_mapcount at 0, but avoid surprise if it participates later.
- */
-#define COMPOUND_MAPPED 0x800000
-#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1)
-
-/*
- * Number of sub-pages mapped by PTE, does not include compound mapcount.
- * Must be called only on head of compound page.
- */
-static inline int head_subpages_mapcount(struct page *head)
-{
- return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED;
+ return atomic_read(&folio->_entire_mapcount) + 1;
}
/*
@@ -872,25 +961,29 @@ static inline void page_mapcount_reset(struct page *page)
atomic_set(&(page)->_mapcount, -1);
}
-/*
- * Mapcount of 0-order page; when compound sub-page, includes
- * compound_mapcount of compound_head of page.
+/**
+ * page_mapcount() - Number of times this precise page is mapped.
+ * @page: The page.
+ *
+ * The number of times this page is mapped. If this page is part of
+ * a large folio, it includes the number of times this page is mapped
+ * as part of that folio.
*
- * Result is undefined for pages which cannot be mapped into userspace.
+ * The result is undefined for pages which cannot be mapped into userspace.
* For example SLAB or special types of pages. See function page_has_type().
- * They use this place in struct page differently.
+ * They use this field in struct page differently.
*/
static inline int page_mapcount(struct page *page)
{
int mapcount = atomic_read(&page->_mapcount) + 1;
- if (likely(!PageCompound(page)))
- return mapcount;
- page = compound_head(page);
- return head_compound_mapcount(page) + mapcount;
+ if (unlikely(PageCompound(page)))
+ mapcount += folio_entire_mapcount(page_folio(page));
+
+ return mapcount;
}
-int total_compound_mapcount(struct page *head);
+int folio_total_mapcount(struct folio *folio);
/**
* folio_mapcount() - Calculate the number of mappings of this folio.
@@ -907,24 +1000,24 @@ static inline int folio_mapcount(struct folio *folio)
{
if (likely(!folio_test_large(folio)))
return atomic_read(&folio->_mapcount) + 1;
- return total_compound_mapcount(&folio->page);
+ return folio_total_mapcount(folio);
}
static inline int total_mapcount(struct page *page)
{
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) + 1;
- return total_compound_mapcount(compound_head(page));
+ return folio_total_mapcount(page_folio(page));
}
static inline bool folio_large_is_mapped(struct folio *folio)
{
/*
- * Reading folio_mapcount_ptr() below could be omitted if hugetlb
- * participated in incrementing subpages_mapcount when compound mapped.
+ * Reading _entire_mapcount below could be omitted if hugetlb
+ * participated in incrementing nr_pages_mapped when compound mapped.
*/
- return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 ||
- atomic_read(folio_mapcount_ptr(folio)) >= 0;
+ return atomic_read(&folio->_nr_pages_mapped) > 0 ||
+ atomic_read(&folio->_entire_mapcount) >= 0;
}
/**
@@ -999,8 +1092,11 @@ extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];
static inline void set_compound_page_dtor(struct page *page,
enum compound_dtor_id compound_dtor)
{
+ struct folio *folio = (struct folio *)page;
+
VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
- page[1].compound_dtor = compound_dtor;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ folio->_folio_dtor = compound_dtor;
}
static inline void folio_set_compound_dtor(struct folio *folio,
@@ -1012,44 +1108,13 @@ static inline void folio_set_compound_dtor(struct folio *folio,
void destroy_large_folio(struct folio *folio);
-static inline int head_compound_pincount(struct page *head)
-{
- return atomic_read(compound_pincount_ptr(head));
-}
-
static inline void set_compound_order(struct page *page, unsigned int order)
{
- page[1].compound_order = order;
-#ifdef CONFIG_64BIT
- page[1].compound_nr = 1U << order;
-#endif
-}
-
-/*
- * folio_set_compound_order is generally passed a non-zero order to
- * initialize a large folio. However, hugetlb code abuses this by
- * passing in zero when 'dissolving' a large folio.
- */
-static inline void folio_set_compound_order(struct folio *folio,
- unsigned int order)
-{
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ struct folio *folio = (struct folio *)page;
folio->_folio_order = order;
#ifdef CONFIG_64BIT
- folio->_folio_nr_pages = order ? 1U << order : 0;
-#endif
-}
-
-/* Returns the number of pages in this potentially compound page. */
-static inline unsigned long compound_nr(struct page *page)
-{
- if (!PageHead(page))
- return 1;
-#ifdef CONFIG_64BIT
- return page[1].compound_nr;
-#else
- return 1UL << compound_order(page);
+ folio->_folio_nr_pages = 1U << order;
#endif
}
@@ -1076,16 +1141,6 @@ static inline unsigned int thp_order(struct page *page)
}
/**
- * thp_nr_pages - The number of regular pages in this huge page.
- * @page: The head page of a huge page.
- */
-static inline int thp_nr_pages(struct page *page)
-{
- VM_BUG_ON_PGFLAGS(PageTail(page), page);
- return compound_nr(page);
-}
-
-/**
* thp_size - Size of a transparent huge page.
* @page: Head page of a transparent huge page.
*
@@ -1226,8 +1281,6 @@ static inline void get_page(struct page *page)
folio_get(page_folio(page));
}
-int __must_check try_grab_page(struct page *page, unsigned int flags);
-
static inline __must_check bool try_get_page(struct page *page)
{
page = compound_head(page);
@@ -1369,6 +1422,21 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+#ifndef CONFIG_MMU
+static inline bool is_nommu_shared_mapping(vm_flags_t flags)
+{
+ /*
+ * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
+ * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
+ * a file mapping. R/O MAP_PRIVATE mappings might still modify
+ * underlying memory if ptrace is active, so this is only possible if
+ * ptrace does not apply. Note that there is no mprotect() to upgrade
+ * write permissions later.
+ */
+ return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
+}
+#endif
+
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1643,11 +1711,6 @@ static inline struct folio *pfn_folio(unsigned long pfn)
return page_folio(pfn_to_page(pfn));
}
-static inline atomic_t *folio_pincount_ptr(struct folio *folio)
-{
- return &folio_page(folio, 1)->compound_pincount;
-}
-
/**
* folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
* @folio: The folio.
@@ -1665,7 +1728,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio)
* expected to be able to deal gracefully with a false positive.
*
* For large folios, the result will be exactly correct. That's because
- * we have more tracking data available: the compound_pincount is used
+ * we have more tracking data available: the _pincount field is used
* instead of the GUP_PIN_COUNTING_BIAS scheme.
*
* For more information, please see Documentation/core-api/pin_user_pages.rst.
@@ -1676,7 +1739,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio)
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
if (folio_test_large(folio))
- return atomic_read(folio_pincount_ptr(folio)) > 0;
+ return atomic_read(&folio->_pincount) > 0;
/*
* folio_ref_count() is signed. If that refcount overflows, then
@@ -1784,6 +1847,33 @@ static inline long folio_nr_pages(struct folio *folio)
#endif
}
+/*
+ * compound_nr() returns the number of pages in this potentially compound
+ * page. compound_nr() can be called on a tail page, and is defined to
+ * return 1 in that case.
+ */
+static inline unsigned long compound_nr(struct page *page)
+{
+ struct folio *folio = (struct folio *)page;
+
+ if (!test_bit(PG_head, &folio->flags))
+ return 1;
+#ifdef CONFIG_64BIT
+ return folio->_folio_nr_pages;
+#else
+ return 1L << folio->_folio_order;
+#endif
+}
+
+/**
+ * thp_nr_pages - The number of regular pages in this huge page.
+ * @page: The head page of a huge page.
+ */
+static inline int thp_nr_pages(struct page *page)
+{
+ return folio_nr_pages((struct folio *)page);
+}
+
/**
* folio_next - Move to the next physical folio.
* @folio: The folio we're currently operating on.
@@ -1833,6 +1923,24 @@ static inline size_t folio_size(struct folio *folio)
return PAGE_SIZE << folio_order(folio);
}
+/**
+ * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * @folio: The folio.
+ *
+ * folio_estimated_sharers() aims to serve as a function to efficiently
+ * estimate the number of processes sharing a folio. This is done by
+ * looking at the precise mapcount of the first subpage in the folio, and
+ * assuming the other subpages are the same. This may not be true for large
+ * folios. If you want exact mapcounts for exact calculations, look at
+ * page_mapcount() or folio_total_mapcount().
+ *
+ * Return: The estimated number of processes sharing a folio.
+ */
+static inline int folio_estimated_sharers(struct folio *folio)
+{
+ return page_mapcount(folio_page(folio, 0));
+}
+
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
static inline int arch_make_page_accessible(struct page *page)
{
@@ -1929,6 +2037,21 @@ static inline bool page_is_pfmemalloc(const struct page *page)
}
/*
+ * Return true only if the folio has been allocated with
+ * ALLOC_NO_WATERMARKS and the low watermark was not
+ * met implying that the system is under some pressure.
+ */
+static inline bool folio_is_pfmemalloc(const struct folio *folio)
+{
+ /*
+ * lru.next has bit 1 set if the page is allocated from the
+ * pfmemalloc reserves. Callers may simply overwrite it if
+ * they do not need to preserve that information.
+ */
+ return (uintptr_t)folio->lru.next & BIT(1);
+}
+
+/*
* Only to be called by the page allocator on a freshly allocated
* page.
*/
@@ -2015,6 +2138,8 @@ static inline bool can_do_mlock(void) { return false; }
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);
+struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -2022,13 +2147,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
- unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details);
+static inline void zap_vma_pages(struct vm_area_struct *vma)
+{
+ zap_page_range_single(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, NULL);
+}
void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end);
+ unsigned long end, bool mm_wr_locked);
struct mmu_notifier_range;
@@ -2175,21 +2303,18 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma
}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
-extern unsigned long change_protection(struct mmu_gather *tlb,
+extern long change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags);
-extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
- struct vm_area_struct **pprev, unsigned long start,
- unsigned long end, unsigned long newflags);
+ unsigned long end, unsigned long cp_flags);
+extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
+ struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ unsigned long start, unsigned long end, unsigned long newflags);
/*
* doesn't attempt to fault and will return short.
*/
int get_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
-int pin_user_pages_fast_only(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages);
static inline bool get_user_page_fast_only(unsigned long addr,
unsigned int gup_flags, struct page **pagep)
@@ -2813,23 +2938,21 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand);
-static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
-{
- return __vma_adjust(vma, start, end, pgoff, insert, NULL);
-}
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
- struct vm_area_struct *prev, unsigned long addr, unsigned long end,
- unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *);
+extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next);
+extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff);
+extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
+ struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
+ unsigned long end, unsigned long vm_flags, struct anon_vma *,
+ struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
+ struct anon_vma_name *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
- unsigned long addr, int new_below);
-extern int split_vma(struct mm_struct *, struct vm_area_struct *,
- unsigned long addr, int new_below);
+extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
+ unsigned long addr, int new_below);
+extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
+ unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
@@ -2837,9 +2960,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
-void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
-void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);
-
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
unsigned long start,
@@ -2887,7 +3007,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
-extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
@@ -2895,6 +3015,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t,
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
+extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *uf, bool downgrade);
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
@@ -3100,81 +3223,6 @@ static inline vm_fault_t vmf_error(int err)
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags);
-#define FOLL_WRITE 0x01 /* check pte is writable */
-#define FOLL_TOUCH 0x02 /* mark page accessed */
-#define FOLL_GET 0x04 /* do get_page on page */
-#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
-#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
-#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
- * and return without waiting upon it */
-#define FOLL_NOFAULT 0x80 /* do not fault in pages */
-#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
-#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
-#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
-#define FOLL_ANON 0x8000 /* don't do file mappings */
-#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
-#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
-#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
-#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
-#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */
-#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */
-
-/*
- * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
- * other. Here is what they mean, and how to use them:
- *
- * FOLL_LONGTERM indicates that the page will be held for an indefinite time
- * period _often_ under userspace control. This is in contrast to
- * iov_iter_get_pages(), whose usages are transient.
- *
- * FIXME: For pages which are part of a filesystem, mappings are subject to the
- * lifetime enforced by the filesystem and we need guarantees that longterm
- * users like RDMA and V4L2 only establish mappings which coordinate usage with
- * the filesystem. Ideas for this coordination include revoking the longterm
- * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was
- * added after the problem with filesystems was found FS DAX VMAs are
- * specifically failed. Filesystem pages are still subject to bugs and use of
- * FOLL_LONGTERM should be avoided on those pages.
- *
- * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
- * Currently only get_user_pages() and get_user_pages_fast() support this flag
- * and calls to get_user_pages_[un]locked are specifically not allowed. This
- * is due to an incompatibility with the FS DAX check and
- * FAULT_FLAG_ALLOW_RETRY.
- *
- * In the CMA case: long term pins in a CMA region would unnecessarily fragment
- * that region. And so, CMA attempts to migrate the page before pinning, when
- * FOLL_LONGTERM is specified.
- *
- * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
- * but an additional pin counting system) will be invoked. This is intended for
- * anything that gets a page reference and then touches page data (for example,
- * Direct IO). This lets the filesystem know that some non-file-system entity is
- * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
- * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
- * a call to unpin_user_page().
- *
- * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
- * and separate refcounting mechanisms, however, and that means that each has
- * its own acquire and release mechanisms:
- *
- * FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
- *
- * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
- *
- * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
- * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
- * calls applied to them, and that's perfectly OK. This is a constraint on the
- * callers, not on the pages.)
- *
- * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
- * directly by the caller. That's in order to help avoid mismatches when
- * releasing pages: get_user_pages*() pages must be released via put_page(),
- * while pin_user_pages*() pages must be released via unpin_user_page().
- *
- * Please see Documentation/core-api/pin_user_pages.rst for more information.
- */
-
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
if (vm_fault & VM_FAULT_OOM)
@@ -3187,71 +3235,6 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
}
/*
- * Indicates for which pages that are write-protected in the page table,
- * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
- * GUP pin will remain consistent with the pages mapped into the page tables
- * of the MM.
- *
- * Temporary unmapping of PageAnonExclusive() pages or clearing of
- * PageAnonExclusive() has to protect against concurrent GUP:
- * * Ordinary GUP: Using the PT lock
- * * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration): see
- * page_try_share_anon_rmap()
- *
- * Must be called with the (sub)page that's actually referenced via the
- * page table entry, which might not necessarily be the head page for a
- * PTE-mapped THP.
- *
- * If the vma is NULL, we're coming from the GUP-fast path and might have
- * to fallback to the slow path just to lookup the vma.
- */
-static inline bool gup_must_unshare(struct vm_area_struct *vma,
- unsigned int flags, struct page *page)
-{
- /*
- * FOLL_WRITE is implicitly handled correctly as the page table entry
- * has to be writable -- and if it references (part of) an anonymous
- * folio, that part is required to be marked exclusive.
- */
- if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
- return false;
- /*
- * Note: PageAnon(page) is stable until the page is actually getting
- * freed.
- */
- if (!PageAnon(page)) {
- /*
- * We only care about R/O long-term pining: R/O short-term
- * pinning does not have the semantics to observe successive
- * changes through the process page tables.
- */
- if (!(flags & FOLL_LONGTERM))
- return false;
-
- /* We really need the vma ... */
- if (!vma)
- return true;
-
- /*
- * ... because we only care about writable private ("COW")
- * mappings where we have to break COW early.
- */
- return is_cow_mapping(vma->vm_flags);
- }
-
- /* Paired with a memory barrier in page_try_share_anon_rmap(). */
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
- smp_rmb();
-
- /*
- * Note that PageKsm() pages cannot be exclusive, and consequently,
- * cannot get pinned.
- */
- return !PageAnonExclusive(page);
-}
-
-/*
* Indicates whether GUP can follow a PROT_NONE mapped page, or whether
* a (NUMA hinting) fault is required.
*/
@@ -3550,6 +3533,11 @@ enum mf_action_page_type {
MF_MSG_UNKNOWN,
};
+/*
+ * Sysfs entries for memory failure handling statistics.
+ */
+extern const struct attribute_group memory_failure_attr_group;
+
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
@@ -3667,7 +3655,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
* VM_MAYWRITE as we still want them to be COW-writable.
*/
if (vma->vm_flags & VM_SHARED)
- vma->vm_flags &= ~(VM_MAYWRITE);
+ vm_flags_clear(vma, VM_MAYWRITE);
}
return 0;
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ff3f3f23f649..de1e622dd366 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
enum lru_list lru = type * LRU_INACTIVE_FILE;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
@@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
if (reclaiming)
- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
else
- list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_add(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
#endif
}
+static inline bool vma_has_recency(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
+ return false;
+
+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
+ return false;
+
+ return true;
+}
+
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index af8119776ab1..0722859c3647 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -140,30 +140,6 @@ struct page {
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
-
- /* First tail page only */
- unsigned char compound_dtor;
- unsigned char compound_order;
- atomic_t compound_mapcount;
- atomic_t subpages_mapcount;
- atomic_t compound_pincount;
-#ifdef CONFIG_64BIT
- unsigned int compound_nr; /* 1 << compound_order */
-#endif
- };
- struct { /* Second tail page of transparent huge page */
- unsigned long _compound_pad_1; /* compound_head */
- unsigned long _compound_pad_2;
- /* For both global and memcg */
- struct list_head deferred_list;
- };
- struct { /* Second tail page of hugetlb page */
- unsigned long _hugetlb_pad_1; /* compound_head */
- void *hugetlb_subpool;
- void *hugetlb_cgroup;
- void *hugetlb_cgroup_rsvd;
- void *hugetlb_hwpoison;
- /* No more space on 32-bit: use third tail if more */
};
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
@@ -302,20 +278,17 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page)
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
- * @_flags_1: For large folios, additional page flags.
- * @_head_1: Points to the folio. Do not use.
* @_folio_dtor: Which destructor to use for this folio.
* @_folio_order: Do not use directly, call folio_order().
- * @_compound_mapcount: Do not use directly, call folio_entire_mapcount().
- * @_subpages_mapcount: Do not use directly, call folio_mapcount().
+ * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
+ * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
* @_folio_nr_pages: Do not use directly, call folio_nr_pages().
- * @_flags_2: For alignment. Do not use.
- * @_head_2: Points to the folio. Do not use.
* @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
* @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
* @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
* @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head().
+ * @_deferred_list: Folios to be split under memory pressure.
*
* A folio is a physically, virtually and logically contiguous set
* of bytes. It is a power-of-two in size, and it is aligned to that
@@ -358,14 +331,16 @@ struct folio {
struct {
unsigned long _flags_1;
unsigned long _head_1;
+ /* public: */
unsigned char _folio_dtor;
unsigned char _folio_order;
- atomic_t _compound_mapcount;
- atomic_t _subpages_mapcount;
+ atomic_t _entire_mapcount;
+ atomic_t _nr_pages_mapped;
atomic_t _pincount;
#ifdef CONFIG_64BIT
unsigned int _folio_nr_pages;
#endif
+ /* private: the union with struct page is transitional */
};
struct page __page_1;
};
@@ -373,10 +348,19 @@ struct folio {
struct {
unsigned long _flags_2;
unsigned long _head_2;
+ /* public: */
void *_hugetlb_subpool;
void *_hugetlb_cgroup;
void *_hugetlb_cgroup_rsvd;
void *_hugetlb_hwpoison;
+ /* private: the union with struct page is transitional */
+ };
+ struct {
+ unsigned long _flags_2a;
+ unsigned long _head_2a;
+ /* public: */
+ struct list_head _deferred_list;
+ /* private: the union with struct page is transitional */
};
struct page __page_2;
};
@@ -401,53 +385,14 @@ FOLIO_MATCH(memcg_data, memcg_data);
offsetof(struct page, pg) + sizeof(struct page))
FOLIO_MATCH(flags, _flags_1);
FOLIO_MATCH(compound_head, _head_1);
-FOLIO_MATCH(compound_dtor, _folio_dtor);
-FOLIO_MATCH(compound_order, _folio_order);
-FOLIO_MATCH(compound_mapcount, _compound_mapcount);
-FOLIO_MATCH(subpages_mapcount, _subpages_mapcount);
-FOLIO_MATCH(compound_pincount, _pincount);
-#ifdef CONFIG_64BIT
-FOLIO_MATCH(compound_nr, _folio_nr_pages);
-#endif
#undef FOLIO_MATCH
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct folio, fl) == \
offsetof(struct page, pg) + 2 * sizeof(struct page))
FOLIO_MATCH(flags, _flags_2);
FOLIO_MATCH(compound_head, _head_2);
-FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool);
-FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup);
-FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd);
-FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison);
#undef FOLIO_MATCH
-static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
-{
- struct page *tail = &folio->page + 1;
- return &tail->compound_mapcount;
-}
-
-static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio)
-{
- struct page *tail = &folio->page + 1;
- return &tail->subpages_mapcount;
-}
-
-static inline atomic_t *compound_mapcount_ptr(struct page *page)
-{
- return &page[1].compound_mapcount;
-}
-
-static inline atomic_t *subpages_mapcount_ptr(struct page *page)
-{
- return &page[1].subpages_mapcount;
-}
-
-static inline atomic_t *compound_pincount_ptr(struct page *page)
-{
- return &page[1].compound_pincount;
-}
-
/*
* Used for sizing the vmemmap region on some architectures
*/
@@ -546,7 +491,15 @@ struct vm_area_struct {
* See vmf_insert_mixed_prot() for discussion.
*/
pgprot_t vm_page_prot;
- unsigned long vm_flags; /* Flags, see mm.h. */
+
+ /*
+ * Flags, see mm.h.
+ * To modify use vm_flags_{init|reset|set|clear|mod} functions.
+ */
+ union {
+ const vm_flags_t vm_flags;
+ vm_flags_t __private __vm_flags;
+ };
/*
* For areas with an address space and backing store,
@@ -658,7 +611,7 @@ struct mm_struct {
raw_spinlock_t cid_lock;
#endif
#ifdef CONFIG_MMU
- atomic_long_t pgtables_bytes; /* PTE page table pages */
+ atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
int map_count; /* number of VMAs */
@@ -915,9 +868,7 @@ struct vma_iterator {
static inline void vma_iter_init(struct vma_iterator *vmi,
struct mm_struct *mm, unsigned long addr)
{
- vmi->mas.tree = &mm->mm_mt;
- vmi->mas.index = addr;
- vmi->mas.node = MAS_START;
+ mas_init(&vmi->mas, &mm->mm_mt, addr);
}
#ifdef CONFIG_SCHED_MM_CID
@@ -1126,4 +1077,87 @@ enum fault_flag {
typedef unsigned int __bitwise zap_flags_t;
+/*
+ * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
+ * other. Here is what they mean, and how to use them:
+ * FOLL_LONGTERM is described next to its definition in the enum below.
+ *
+ * FIXME: For pages which are part of a filesystem, mappings are subject to the
+ * lifetime enforced by the filesystem and we need guarantees that longterm
+ * users like RDMA and V4L2 only establish mappings which coordinate usage with
+ * the filesystem. Ideas for this coordination include revoking the longterm
+ * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was
+ * added after the problem with filesystems was found FS DAX VMAs are
+ * specifically failed. Filesystem pages are still subject to bugs and use of
+ * FOLL_LONGTERM should be avoided on those pages.
+ *
+ * In the CMA case: long term pins in a CMA region would unnecessarily fragment
+ * that region. And so, CMA attempts to migrate the page before pinning, when
+ * FOLL_LONGTERM is specified.
+ *
+ * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
+ * but an additional pin counting system) will be invoked. This is intended for
+ * anything that gets a page reference and then touches page data (for example,
+ * Direct IO). This lets the filesystem know that some non-file-system entity is
+ * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
+ * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
+ * a call to unpin_user_page().
+ *
+ * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
+ * and separate refcounting mechanisms, however, and that means that each has
+ * its own acquire and release mechanisms:
+ *
+ * FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
+ *
+ * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
+ *
+ * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
+ * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
+ * calls applied to them, and that's perfectly OK. This is a constraint on the
+ * callers, not on the pages.)
+ *
+ * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
+ * directly by the caller. That's in order to help avoid mismatches when
+ * releasing pages: get_user_pages*() pages must be released via put_page(),
+ * while pin_user_pages*() pages must be released via unpin_user_page().
+ *
+ * Please see Documentation/core-api/pin_user_pages.rst for more information.
+ */
+
+enum {
+ /* check pte is writable */
+ FOLL_WRITE = 1 << 0,
+ /* do get_page on page */
+ FOLL_GET = 1 << 1,
+ /* give error on hole if it would be zero */
+ FOLL_DUMP = 1 << 2,
+ /* get_user_pages read/write w/o permission */
+ FOLL_FORCE = 1 << 3,
+ /*
+ * if a disk transfer is needed, start the IO and return without waiting
+ * upon it
+ */
+ FOLL_NOWAIT = 1 << 4,
+ /* do not fault in pages */
+ FOLL_NOFAULT = 1 << 5,
+ /* check page is hwpoisoned */
+ FOLL_HWPOISON = 1 << 6,
+ /* don't do file mappings */
+ FOLL_ANON = 1 << 7,
+ /*
+ * FOLL_LONGTERM indicates that the page will be held for an indefinite
+ * time period _often_ under userspace control. This is in contrast to
+ * iov_iter_get_pages(), whose usages are transient.
+ */
+ FOLL_LONGTERM = 1 << 8,
+ /* split huge pmd before returning */
+ FOLL_SPLIT_PMD = 1 << 9,
+ /* allow returning PCI P2PDMA pages */
+ FOLL_PCI_P2PDMA = 1 << 10,
+ /* allow interrupts from generic signals */
+ FOLL_INTERRUPTIBLE = 1 << 11,
+
+ /* See also internal only FOLL flags in mm/internal.h */
+};
+
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 58b3abd457a3..cee1e4b566d8 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags)
}
unsigned long vm_commit_limit(void);
+
+/*
+ * Denies creating a writable executable mapping or gaining executable permissions.
+ *
+ * This denies the following:
+ *
+ * a) mmap(PROT_WRITE | PROT_EXEC)
+ *
+ * b) mmap(PROT_WRITE)
+ * mprotect(PROT_EXEC)
+ *
+ * c) mmap(PROT_WRITE)
+ * mprotect(PROT_READ)
+ * mprotect(PROT_EXEC)
+ *
+ * But allows the following:
+ *
+ * d) mmap(PROT_READ | PROT_EXEC)
+ * mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ */
+static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+{
+ if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ return false;
+
+ if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+ return true;
+
+ if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+ return true;
+
+ return false;
+}
+
#endif /* _LINUX_MMAN_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d6c06e140277..64a3e051c3c4 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -269,7 +269,6 @@ extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif
struct mmu_notifier_range {
- struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
@@ -514,12 +513,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
enum mmu_notifier_event event,
unsigned flags,
- struct vm_area_struct *vma,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
- range->vma = vma;
range->event = event;
range->mm = mm;
range->start = start;
@@ -530,10 +527,10 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
static inline void mmu_notifier_range_init_owner(
struct mmu_notifier_range *range,
enum mmu_notifier_event event, unsigned int flags,
- struct vm_area_struct *vma, struct mm_struct *mm,
- unsigned long start, unsigned long end, void *owner)
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, void *owner)
{
- mmu_notifier_range_init(range, event, flags, vma, mm, start, end);
+ mmu_notifier_range_init(range, event, flags, mm, start, end);
range->owner = owner;
}
@@ -659,9 +656,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
range->end = end;
}
-#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \
+#define mmu_notifier_range_init(range,event,flags,mm,start,end) \
_mmu_notifier_range_init(range, start, end)
-#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \
+#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \
end, owner) \
_mmu_notifier_range_init(range, start, end)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4..9fb1b03b83b2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -312,7 +313,7 @@ enum lruvec_flags {
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in folio->flags stores gen+1 while
- * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ * a page is on one of lrugen->folios[]. Otherwise it stores 0.
*
* A page is added to the youngest generation on faulting. The aging needs to
* check the accessed bit at least twice before handing this page over to the
@@ -324,8 +325,8 @@ enum lruvec_flags {
* rest of generations, if they exist, are considered inactive. See
* lru_gen_is_active().
*
- * PG_active is always cleared while a page is on one of lrugen->lists[] so that
- * the aging needs not to worry about it. And it's set again when a page
+ * PG_active is always cleared while a page is on one of lrugen->folios[] so
+ * that the aging needs not to worry about it. And it's set again when a page
* considered active is isolated for non-reclaiming purposes, e.g., migration.
* See lru_gen_add_folio() and lru_gen_del_folio().
*
@@ -404,7 +405,7 @@ enum {
* The number of pages in each generation is eventually consistent and therefore
* can be transiently negative when reset_batch_size() is pending.
*/
-struct lru_gen_struct {
+struct lru_gen_folio {
/* the aging increments the youngest generation number */
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
@@ -412,7 +413,7 @@ struct lru_gen_struct {
/* the birth time of each generation in jiffies */
unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
@@ -426,6 +427,14 @@ struct lru_gen_struct {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_folio belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_folio belongs to */
+ u8 seg;
+ /* per-node lru_gen_folio list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};
enum {
@@ -461,7 +470,7 @@ struct lru_gen_mm_state {
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
- /* unstable max_seq from lru_gen_struct */
+ /* unstable max_seq from lru_gen_folio */
unsigned long max_seq;
/* the next address within an mm to scan */
unsigned long next_addr;
@@ -479,12 +488,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_folio per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_folio list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_soft_reclaim(struct lruvec *lruvec);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#else /* !CONFIG_LRU_GEN */
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -494,6 +578,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -501,7 +586,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#endif /* CONFIG_LRU_GEN */
@@ -524,7 +626,7 @@ struct lruvec {
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
- struct lru_gen_struct lrugen;
+ struct lru_gen_folio lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
@@ -1110,6 +1212,31 @@ struct deferred_split {
};
#endif
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Per NUMA node memory failure handling statistics.
+ */
+struct memory_failure_stats {
+ /*
+ * Number of raw pages poisoned.
+ * Cases not accounted: memory outside kernel control, offline page,
+ * arch-specific memory_failure (SGX), hwpoison_filter() filtered
+ * error events, and unpoison actions from hwpoison_unpoison.
+ */
+ unsigned long total;
+ /*
+ * Recovery results of poisoned raw pages handled by memory_failure,
+ * in sync with mf_result.
+ * total = ignored + failed + delayed + recovered.
+ * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
+ */
+ unsigned long ignored;
+ unsigned long failed;
+ unsigned long delayed;
+ unsigned long recovered;
+};
+#endif
+
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
@@ -1243,6 +1370,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_folio list */
+ struct lru_gen_memcg memcg_lru;
#endif
CACHELINE_PADDING(_pad2_);
@@ -1253,6 +1382,9 @@ typedef struct pglist_data {
#ifdef CONFIG_NUMA
struct memory_tier __rcu *memtier;
#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ struct memory_failure_stats mf_stats;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 69e93a0c1277..a7e3a3405520 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -531,6 +531,7 @@ PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
* available at this point.
*/
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
+#define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f))
#else
PAGEFLAG_FALSE(HighMem, highmem)
#endif
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 22be4582faae..bc2e39090a1f 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -7,15 +7,35 @@
#include <linux/stackdepot.h>
struct pglist_data;
+
+/**
+ * struct page_ext_operations - per page_ext client operations
+ * @offset: Offset to the client's data within page_ext. Offset is returned to
+ * the client by page_ext_init.
+ * @size: The size of the client data within page_ext.
+ * @need: Function that returns true if client requires page_ext.
+ * @init: (optional) Called to initialize client once page_exts are allocated.
+ * @need_shared_flags: True when client is using shared page_ext->flags
+ * field.
+ *
+ * Each Page Extension client must define page_ext_operations in
+ * page_ext_ops array.
+ */
struct page_ext_operations {
size_t offset;
size_t size;
bool (*need)(void);
void (*init)(void);
+ bool need_shared_flags;
};
+extern bool deferred_struct_pages;
+
#ifdef CONFIG_PAGE_EXTENSION
+/*
+ * The page_ext_flags users must set need_shared_flags to true.
+ */
enum page_ext_flags {
PAGE_EXT_OWNER,
PAGE_EXT_OWNER_ALLOCATED,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2f5b36f446cc..0acb8e1fb7af 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -547,6 +547,26 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping,
}
/**
+ * filemap_grab_folio - grab a folio from the page cache
+ * @mapping: The address space to search
+ * @index: The page index
+ *
+ * Looks up the page cache entry at @mapping & @index. If no folio is found,
+ * a new folio is created. The folio is locked, marked as accessed, and
+ * returned.
+ *
+ * Return: A found or created folio. NULL if no folio is found and failed to
+ * create a folio.
+ */
+static inline struct folio *filemap_grab_folio(struct address_space *mapping,
+ pgoff_t index)
+{
+ return __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(mapping));
+}
+
+/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
* @offset: the page index
@@ -719,16 +739,8 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_contig(struct address_space *mapping,
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
-unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
- struct page **pages);
-static inline unsigned find_get_pages_tag(struct address_space *mapping,
- pgoff_t *index, xa_mark_t tag, unsigned int nr_pages,
- struct page **pages)
-{
- return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
- nr_pages, pages);
-}
+unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index);
@@ -744,6 +756,8 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);
+struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index,
+ gfp_t flags);
struct page *read_cache_page(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 215eb6c3bdc9..f582f7213ea5 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -26,14 +26,6 @@ struct pagevec {
};
void __pagevec_release(struct pagevec *pvec);
-unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, pgoff_t end,
- xa_mark_t tag);
-static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, xa_mark_t tag)
-{
- return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
-}
static inline void pagevec_init(struct pagevec *pvec)
{
@@ -103,6 +95,11 @@ static inline void folio_batch_init(struct folio_batch *fbatch)
fbatch->percpu_pvec_drained = false;
}
+static inline void folio_batch_reinit(struct folio_batch *fbatch)
+{
+ fbatch->nr = 0;
+}
+
static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
{
return fbatch->nr;
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 959f52e5867d..27a6df448ee5 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -21,7 +21,16 @@ struct mm_walk;
* depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
* Any folded depths (where PTRS_PER_P?D is equal to 1)
* are skipped.
- * @hugetlb_entry: if set, called for each hugetlb entry
+ * @hugetlb_entry: if set, called for each hugetlb entry. This hook
+ * function is called with the vma lock held, in order to
+ * protect against a concurrent freeing of the pte_t* or
+ * the ptl. In some cases, the hook function needs to drop
+ * and retake the vma lock in order to avoid deadlocks
+ * while calling other functions. In such cases the hook
+ * function must either refrain from accessing the pte or
+ * ptl after dropping the vma lock, or else revalidate
+ * those items after re-acquiring the vma lock and before
+ * accessing them.
* @test_walk: caller specific callback function to determine whether
* we walk over the current vma or not. Returning 0 means
* "do page table walk over the current vma", returning
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1159b25b0542..c63cd44777ec 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1064,35 +1064,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define arch_start_context_switch(prev) do {} while (0)
#endif
-/*
- * When replacing an anonymous page by a real (!non) swap entry, we clear
- * PG_anon_exclusive from the page and instead remember whether the flag was
- * set in the swp pte. During fork(), we have to mark the entry as !exclusive
- * (possibly shared). On swapin, we use that information to restore
- * PG_anon_exclusive, which is very helpful in cases where we might have
- * additional (e.g., FOLL_GET) references on a page and wouldn't be able to
- * detect exclusivity.
- *
- * These functions don't apply to non-swap entries (e.g., migration, hwpoison,
- * ...).
- */
-#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
-static inline pte_t pte_swp_mkexclusive(pte_t pte)
-{
- return pte;
-}
-
-static inline int pte_swp_exclusive(pte_t pte)
-{
- return false;
-}
-
-static inline pte_t pte_swp_clear_exclusive(pte_t pte)
-{
- return pte;
-}
-#endif
-
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
@@ -1214,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma)
* can be for the entire vma (in which case pfn, size are zero).
*/
static inline void untrack_pfn(struct vm_area_struct *vma,
- unsigned long pfn, unsigned long size)
+ unsigned long pfn, unsigned long size,
+ bool mm_wr_locked)
{
}
@@ -1232,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size);
+ unsigned long size, bool mm_wr_locked);
extern void untrack_pfn_moved(struct vm_area_struct *vma);
#endif
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 07481bb87d4e..c758809d5bcf 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -16,6 +16,21 @@
struct fs_pin;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+/*
+ * sysctl for vm.memfd_noexec
+ * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
+ * acts like MFD_EXEC was set.
+ * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
+ * acts like MFD_NOEXEC_SEAL was set.
+ * 2: memfd_create() without MFD_NOEXEC_SEAL will be
+ * rejected.
+ */
+#define MEMFD_NOEXEC_SCOPE_EXEC 0
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2
+#endif
+
struct pid_namespace {
struct idr idr;
struct rcu_head rcu;
@@ -31,6 +46,10 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ /* sysctl for vm.memfd_noexec */
+ int memfd_noexec_scope;
+#endif
} __randomize_layout;
extern struct pid_namespace init_pid_ns;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bd3504d11b15..a4570da03e58 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -194,6 +194,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long address);
+void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
+ unsigned long address);
void page_add_file_rmap(struct page *, struct vm_area_struct *,
bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
@@ -201,12 +203,19 @@ void page_remove_rmap(struct page *, struct vm_area_struct *,
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
-void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address);
static inline void __page_dup_rmap(struct page *page, bool compound)
{
- atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
+ if (compound) {
+ struct folio *folio = (struct folio *)page;
+
+ VM_BUG_ON_PAGE(compound && !PageHead(page), page);
+ atomic_inc(&folio->_entire_mapcount);
+ } else {
+ atomic_inc(&page->_mapcount);
+ }
}
static inline void page_dup_file_rmap(struct page *page, bool compound)
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 8270ad7ae14c..0e17ae7fbfd3 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
* lifecycle of this mm, just for simplicity.
*/
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
+
+#define MMF_HAS_MDWE 28
+#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
+
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
- MMF_DISABLE_THP_MASK)
+ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d500ea967dc7..103d1000a5a2 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -92,14 +92,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
-extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
- pgoff_t index, bool shmem_huge_force);
-static inline bool shmem_huge_enabled(struct vm_area_struct *vma,
- bool shmem_huge_force)
-{
- return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff,
- shmem_huge_force);
-}
+extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+ struct mm_struct *mm, unsigned long vm_flags);
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -115,6 +109,14 @@ enum sgp_type {
int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
enum sgp_type sgp);
+struct folio *shmem_read_folio_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp);
+
+static inline struct folio *shmem_read_folio(struct address_space *mapping,
+ pgoff_t index)
+{
+ return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping));
+}
static inline struct page *shmem_read_mapping_page(
struct address_space *mapping, pgoff_t index)
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 5834bad8ad78..a61e7d55d0d3 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -72,7 +72,7 @@ struct kmem_cache {
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_cache kasan_info;
#endif
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index aa0ee1678d29..f6df03f934e5 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -136,7 +136,7 @@ struct kmem_cache {
unsigned int *random_seq;
#endif
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_cache kasan_info;
#endif
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 9ca7798d7a31..e58306783d8e 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -1,11 +1,22 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * A generic stack depot implementation
+ * Stack depot - a stack trace storage that avoids duplication.
+ *
+ * Stack depot is intended to be used by subsystems that need to store and
+ * later retrieve many potentially duplicated stack traces without wasting
+ * memory.
+ *
+ * For example, KASAN needs to save allocation and free stack traces for each
+ * object. Storing two stack traces per object requires a lot of memory (e.g.
+ * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free
+ * stack traces often repeat, using stack depot allows to save about 100x space.
+ *
+ * Stack traces are never removed from the stack depot.
*
* Author: Alexander Potapenko <glider@google.com>
* Copyright (C) 2016 Google, Inc.
*
- * Based on code by Dmitry Chernenkov.
+ * Based on the code by Dmitry Chernenkov.
*/
#ifndef _LINUX_STACKDEPOT_H
@@ -14,62 +25,143 @@
#include <linux/gfp.h>
typedef u32 depot_stack_handle_t;
+
/*
* Number of bits in the handle that stack depot doesn't use. Users may store
- * information in them.
+ * information in them via stack_depot_set/get_extra_bits.
*/
#define STACK_DEPOT_EXTRA_BITS 5
-depot_stack_handle_t __stack_depot_save(unsigned long *entries,
- unsigned int nr_entries,
- unsigned int extra_bits,
- gfp_t gfp_flags, bool can_alloc);
-
/*
- * Every user of stack depot has to call stack_depot_init() during its own init
- * when it's decided that it will be calling stack_depot_save() later. This is
- * recommended for e.g. modules initialized later in the boot process, when
- * slab_is_available() is true.
- *
- * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot
- * enabled as part of mm_init(), for subsystems where it's known at compile time
- * that stack depot will be used.
- *
- * Another alternative is to call stack_depot_want_early_init(), when the
- * decision to use stack depot is taken e.g. when evaluating kernel boot
- * parameters, which precedes the enablement point in mm_init().
- *
- * stack_depot_init() and stack_depot_want_early_init() can be called regardless
- * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print
- * functions should only be called from code that makes sure CONFIG_STACKDEPOT
- * is enabled.
+ * Using stack depot requires its initialization, which can be done in 3 ways:
+ *
+ * 1. Selecting CONFIG_STACKDEPOT_ALWAYS_INIT. This option is suitable in
+ * scenarios where it's known at compile time that stack depot will be used.
+ * Enabling this config makes the kernel initialize stack depot in mm_init().
+ *
+ * 2. Calling stack_depot_request_early_init() during early boot, before
+ * stack_depot_early_init() in mm_init() completes. For example, this can
+ * be done when evaluating kernel boot parameters.
+ *
+ * 3. Calling stack_depot_init(). Possible after boot is complete. This option
+ * is recommended for modules initialized later in the boot process, after
+ * mm_init() completes.
+ *
+ * stack_depot_init() and stack_depot_request_early_init() can be called
+ * regardless of whether CONFIG_STACKDEPOT is enabled and are no-op when this
+ * config is disabled. The save/fetch/print stack depot functions can only be
+ * called from the code that makes sure CONFIG_STACKDEPOT is enabled _and_
+ * initializes stack depot via one of the ways listed above.
*/
#ifdef CONFIG_STACKDEPOT
int stack_depot_init(void);
-void __init stack_depot_want_early_init(void);
+void __init stack_depot_request_early_init(void);
-/* This is supposed to be called only from mm_init() */
+/* Must be only called from mm_init(). */
int __init stack_depot_early_init(void);
#else
static inline int stack_depot_init(void) { return 0; }
-static inline void stack_depot_want_early_init(void) { }
+static inline void stack_depot_request_early_init(void) { }
static inline int stack_depot_early_init(void) { return 0; }
#endif
+/**
+ * __stack_depot_save - Save a stack trace to stack depot
+ *
+ * @entries: Pointer to the stack trace
+ * @nr_entries: Number of frames in the stack
+ * @gfp_flags: Allocation GFP flags
+ * @can_alloc: Allocate stack pools (increased chance of failure if false)
+ *
+ * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
+ * %true, stack depot can replenish the stack pools in case no space is left
+ * (allocates using GFP flags of @gfp_flags). If @can_alloc is %false, avoids
+ * any allocations and fails if no space is left to store the stack trace.
+ *
+ * If the provided stack trace comes from the interrupt context, only the part
+ * up to the interrupt entry is saved.
+ *
+ * Context: Any context, but setting @can_alloc to %false is required if
+ * alloc_pages() cannot be used from the current context. Currently
+ * this is the case for contexts where neither %GFP_ATOMIC nor
+ * %GFP_NOWAIT can be used (NMI, raw_spin_lock).
+ *
+ * Return: Handle of the stack struct stored in depot, 0 on failure
+ */
+depot_stack_handle_t __stack_depot_save(unsigned long *entries,
+ unsigned int nr_entries,
+ gfp_t gfp_flags, bool can_alloc);
+
+/**
+ * stack_depot_save - Save a stack trace to stack depot
+ *
+ * @entries: Pointer to the stack trace
+ * @nr_entries: Number of frames in the stack
+ * @gfp_flags: Allocation GFP flags
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed.
+ * See __stack_depot_save() for more details.
+ *
+ * Return: Handle of the stack trace stored in depot, 0 on failure
+ */
depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int nr_entries, gfp_t gfp_flags);
+/**
+ * stack_depot_fetch - Fetch a stack trace from stack depot
+ *
+ * @handle: Stack depot handle returned from stack_depot_save()
+ * @entries: Pointer to store the address of the stack trace
+ *
+ * Return: Number of frames for the fetched stack
+ */
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries);
-unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle);
+/**
+ * stack_depot_print - Print a stack trace from stack depot
+ *
+ * @stack: Stack depot handle returned from stack_depot_save()
+ */
+void stack_depot_print(depot_stack_handle_t stack);
+/**
+ * stack_depot_snprint - Print a stack trace from stack depot into a buffer
+ *
+ * @handle: Stack depot handle returned from stack_depot_save()
+ * @buf: Pointer to the print buffer
+ * @size: Size of the print buffer
+ * @spaces: Number of leading spaces to print
+ *
+ * Return: Number of bytes printed
+ */
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
int spaces);
-void stack_depot_print(depot_stack_handle_t stack);
+/**
+ * stack_depot_set_extra_bits - Set extra bits in a stack depot handle
+ *
+ * @handle: Stack depot handle returned from stack_depot_save()
+ * @extra_bits: Value to set the extra bits
+ *
+ * Return: Stack depot handle with extra bits set
+ *
+ * Stack depot handles have a few unused bits, which can be used for storing
+ * user-specific information. These bits are transparent to the stack depot.
+ */
+depot_stack_handle_t __must_check stack_depot_set_extra_bits(
+ depot_stack_handle_t handle, unsigned int extra_bits);
+
+/**
+ * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle
+ *
+ * @handle: Stack depot handle with extra bits saved
+ *
+ * Return: Extra bits retrieved from the stack depot handle
+ */
+unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle);
#endif
diff --git a/include/linux/string.h b/include/linux/string.h
index db28802ab0a6..c062c581a98b 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -177,6 +177,7 @@ extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
+extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ceed49516ad..209a425739a9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -401,8 +401,8 @@ extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
-extern void deactivate_page(struct page *page);
-extern void mark_page_lazyfree(struct page *page);
+void folio_deactivate(struct folio *folio);
+void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);
extern void lru_cache_add_inactive_or_unevictable(struct page *page,
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index b982dd614572..3a451b7afcb3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE
-extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#endif /* CONFIG_HUGETLB_PAGE */
#else /* CONFIG_MIGRATION */
@@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
#ifdef CONFIG_HUGETLB_PAGE
-static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 9df0b9a762cc..3767f18114ef 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
-extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* mm helpers */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..69250efa03d1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -76,6 +76,7 @@ struct vmap_area {
unsigned long subtree_max_size; /* in "free" tree */
struct vm_struct *vm; /* in "busy" tree */
};
+ unsigned long flags; /* mark type of vm_map_ram area */
};
/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 06f9291b6fd5..46020373e155 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -207,7 +207,7 @@ static inline void wait_on_inode(struct inode *inode)
#include <linux/cgroup.h>
#include <linux/bio.h>
-void __inode_attach_wb(struct inode *inode, struct page *page);
+void __inode_attach_wb(struct inode *inode, struct folio *folio);
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
__releases(&inode->i_lock);
@@ -222,16 +222,16 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb);
/**
* inode_attach_wb - associate an inode with its wb
* @inode: inode of interest
- * @page: page being dirtied (may be NULL)
+ * @folio: folio being dirtied (may be NULL)
*
* If @inode doesn't have its wb, associate it with the wb matching the
- * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o
+ * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o
* @inode->i_lock.
*/
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
+static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
if (!inode->i_wb)
- __inode_attach_wb(inode, page);
+ __inode_attach_wb(inode, folio);
}
/**
@@ -290,7 +290,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
#else /* CONFIG_CGROUP_WRITEBACK */
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
+static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
}
@@ -366,11 +366,9 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
bool wb_over_bg_thresh(struct bdi_writeback *wb);
-typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
void *data);
-int generic_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
int write_cache_pages(struct address_space *mapping,
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 44dd6d6e01bc..741703b45f61 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1643,7 +1643,8 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index,
* @update: Function to call when updating a node.
*
* The XArray can notify a caller after it has updated an xa_node.
- * This is advanced functionality and is only needed by the page cache.
+ * This is advanced functionality and is only needed by the page
+ * cache and swap cache.
*/
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 3d708dae1542..ef75ea606ab2 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -91,12 +91,38 @@ TRACE_EVENT(cma_alloc_start,
__entry->align)
);
-DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
+TRACE_EVENT(cma_alloc_finish,
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
- unsigned long count, unsigned int align),
+ unsigned long count, unsigned int align, int errorno),
- TP_ARGS(name, pfn, page, count, align)
+ TP_ARGS(name, pfn, page, count, align, errorno),
+
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long, pfn)
+ __field(const struct page *, page)
+ __field(unsigned long, count)
+ __field(unsigned int, align)
+ __field(int, errorno)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->pfn = pfn;
+ __entry->page = page;
+ __entry->count = count;
+ __entry->align = align;
+ __entry->errorno = errorno;
+ ),
+
+ TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u errorno=%d",
+ __get_str(name),
+ __entry->pfn,
+ __entry->page,
+ __entry->count,
+ __entry->align,
+ __entry->errorno)
);
DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 412b5a46374c..9db52bc4ce19 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -31,7 +31,6 @@
gfpflag_string(__GFP_HIGHMEM), \
gfpflag_string(GFP_DMA32), \
gfpflag_string(__GFP_HIGH), \
- gfpflag_string(__GFP_ATOMIC), \
gfpflag_string(__GFP_IO), \
gfpflag_string(__GFP_FS), \
gfpflag_string(__GFP_NOWARN), \
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..e8c07da58c9f 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
+#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
/* (1U << 31) is reserved for signed error codes */
/*
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..273a4e15dfcf 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,10 @@
#define MFD_CLOEXEC 0x0001U
#define MFD_ALLOW_SEALING 0x0002U
#define MFD_HUGETLB 0x0004U
+/* not executable and sealed to prevent changing to executable. */
+#define MFD_NOEXEC_SEAL 0x0008U
+/* executable */
+#define MFD_EXEC 0x0010U
/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a5e06dcbba13..1312a137f7fb 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -281,6 +281,12 @@ struct prctl_mm_map {
# define PR_SME_VL_LEN_MASK 0xffff
# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */
+/* Memory deny write / execute */
+#define PR_SET_MDWE 65
+# define PR_MDWE_REFUSE_EXEC_GAIN 1
+
+#define PR_GET_MDWE 66
+
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0