From 0ae0227fa31dda5bfc6b5a0145952d46fe57408b Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Tue, 6 May 2025 03:30:34 +0800 Subject: mm/codetag: move tag retrieval back upfront in __free_pages() Commit 51ff4d7486f0 ("mm: avoid extra mem_alloc_profiling_enabled() checks") introduces a possible use-after-free scenario, when page is non-compound, page[0] could be released by other thread right after put_page_testzero failed in current thread, pgalloc_tag_sub_pages afterwards would manipulate an invalid page for accounting remaining pages: [timeline] [thread1] [thread2] | alloc_page non-compound V | get_page, rf counter inc V | in ___free_pages | put_page_testzero fails V | put_page, page released V | in ___free_pages, | pgalloc_tag_sub_pages | manipulate an invalid page V Restore __free_pages() to its state before, retrieve alloc tag beforehand. Link: https://lkml.kernel.org/r/20250505193034.91682-1-00107082@163.com Fixes: 51ff4d7486f0 ("mm: avoid extra mem_alloc_profiling_enabled() checks") Signed-off-by: David Wang <00107082@163.com> Acked-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5669baf2a6fe..1b00e14a9780 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1151,14 +1151,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) __pgalloc_tag_sub(page, nr); } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) +/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) { - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -1168,7 +1163,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ @@ -5065,11 +5060,13 @@ static void ___free_pages(struct page *page, unsigned int order, { /* get PageHead before we drop reference */ int head = PageHead(page); + /* get alloc tag in case the page is released by others */ + struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) __free_frozen_pages(page, order, fpi_flags); else if (!head) { - pgalloc_tag_sub_pages(page, (1 << order) - 1); + pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) __free_frozen_pages(page + (1 << order), order, fpi_flags); -- cgit v1.2.3-59-g8ed1b From 23fa022a07555a9cd2dcfe827c769a89c4c2e21e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 May 2025 14:25:08 +0300 Subject: mm/page_alloc: ensure try_alloc_pages() plays well with unaccepted memory try_alloc_pages() will not attempt to allocate memory if the system has *any* unaccepted memory. Memory is accepted as needed and can remain in the system indefinitely, causing the interface to always fail. Rather than immediately giving up, attempt to use already accepted memory on free lists. Pass 'alloc_flags' to cond_accept_memory() and do not accept new memory for ALLOC_TRYLOCK requests. Found via code inspection - only BPF uses this at present and the runtime effects are unclear. Link: https://lkml.kernel.org/r/20250506112509.905147-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation") Cc: Alexei Starovoitov Cc: Vlastimil Babka Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b00e14a9780..7248e300d36e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static bool cond_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -3611,7 +3612,7 @@ retry: } } - cond_accept_memory(zone, order); + cond_accept_memory(zone, order, alloc_flags); /* * Detect whether the number of free pages is below high @@ -3638,7 +3639,7 @@ check_alloc_wmark: gfp_mask)) { int ret; - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* @@ -3691,7 +3692,7 @@ try_this_zone: return page; } else { - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* Try again if zone has deferred pages */ @@ -4844,7 +4845,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto failed; } - cond_accept_memory(zone, 0); + cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, @@ -4853,7 +4854,7 @@ retry_this_zone: break; } - if (cond_accept_memory(zone, 0)) + if (cond_accept_memory(zone, 0, alloc_flags)) goto retry_this_zone; /* Try again if zone has deferred pages */ @@ -7281,7 +7282,8 @@ static inline bool has_unaccepted_memory(void) return static_branch_unlikely(&zones_with_unaccepted_pages); } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { long to_accept, wmark; bool ret = false; @@ -7292,6 +7294,10 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (alloc_flags & ALLOC_TRYLOCK) + return false; + wmark = promo_wmark_pages(zone); /* @@ -7348,7 +7354,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { return false; } @@ -7419,11 +7426,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) if (!pcp_allowed_order(order)) return NULL; -#ifdef CONFIG_UNACCEPTED_MEMORY - /* Bailout, since try_to_accept_memory_one() needs to take a lock */ - if (has_unaccepted_memory()) - return NULL; -#endif /* Bailout, since _deferred_grow_zone() needs to take a lock */ if (deferred_pages_enabled()) return NULL; -- cgit v1.2.3-59-g8ed1b From fefc075182275057ce607effaa3daa9e6e3bdc73 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 May 2025 16:32:07 +0300 Subject: mm/page_alloc: fix race condition in unaccepted memory handling The page allocator tracks the number of zones that have unaccepted memory using static_branch_enc/dec() and uses that static branch in hot paths to determine if it needs to deal with unaccepted memory. Borislav and Thomas pointed out that the tracking is racy: operations on static_branch are not serialized against adding/removing unaccepted pages to/from the zone. Sanity checks inside static_branch machinery detects it: WARNING: CPU: 0 PID: 10 at kernel/jump_label.c:276 __static_key_slow_dec_cpuslocked+0x8e/0xa0 The comment around the WARN() explains the problem: /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ The effect of this static_branch optimization is only visible on microbenchmark. Instead of adding more complexity around it, remove it altogether. Link: https://lkml.kernel.org/r/20250506133207.1009676-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") Link: https://lore.kernel.org/all/20250506092445.GBaBnVXXyvnazly6iF@fat_crate.local Reported-by: Borislav Petkov Tested-by: Borislav Petkov (AMD) Reported-by: Thomas Gleixner Cc: Vlastimil Babka Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: [6.5+] Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/mm_init.c | 1 - mm/page_alloc.c | 47 ----------------------------------------------- 3 files changed, 49 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 25a29872c634..5c7a2b43ad76 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1590,7 +1590,6 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); -void unaccepted_cleanup_work(struct work_struct *work); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 327764ca0ee4..eedce9321e13 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1441,7 +1441,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); - INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work); #endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7248e300d36e..8258349e49ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7172,16 +7172,8 @@ bool has_managed_dma(void) #ifdef CONFIG_UNACCEPTED_MEMORY -/* Counts number of zones with unaccepted pages. */ -static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); - static bool lazy_accept = true; -void unaccepted_cleanup_work(struct work_struct *work) -{ - static_branch_dec(&zones_with_unaccepted_pages); -} - static int __init accept_memory_parse(char *p) { if (!strcmp(p, "lazy")) { @@ -7206,11 +7198,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) static void __accept_page(struct zone *zone, unsigned long *flags, struct page *page) { - bool last; - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); __ClearPageUnaccepted(page); @@ -7219,28 +7207,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags, accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) { - /* - * There are two corner cases: - * - * - If allocation occurs during the CPU bring up, - * static_branch_dec() cannot be used directly as - * it causes a deadlock on cpu_hotplug_lock. - * - * Instead, use schedule_work() to prevent deadlock. - * - * - If allocation occurs before workqueues are initialized, - * static_branch_dec() should be called directly. - * - * Workqueues are initialized before CPU bring up, so this - * will not conflict with the first scenario. - */ - if (system_wq) - schedule_work(&zone->unaccepted_cleanup); - else - unaccepted_cleanup_work(&zone->unaccepted_cleanup); - } } void accept_page(struct page *page) @@ -7277,20 +7243,12 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - static bool cond_accept_memory(struct zone *zone, unsigned int order, int alloc_flags) { long to_accept, wmark; bool ret = false; - if (!has_unaccepted_memory()) - return false; - if (list_empty(&zone->unaccepted_pages)) return false; @@ -7328,22 +7286,17 @@ static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); unsigned long flags; - bool first = false; if (!lazy_accept) return false; spin_lock_irqsave(&zone->lock, flags); - first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - if (first) - static_branch_inc(&zones_with_unaccepted_pages); - return true; } -- cgit v1.2.3-59-g8ed1b