From e42dfe4e0a51b476dcc6f1461c51fdb1b76573aa Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 6 Mar 2024 18:13:26 +0800 Subject: mm: record the migration reason for struct migration_target_control Patch series "make the hugetlb migration strategy consistent", v2. As discussed in a previous thread [1], there is an inconsistency when handling hugetlb migration. When handling the migration of freed hugetlb, it prevents fallback to other NUMA nodes in alloc_and_dissolve_hugetlb_folio(). However, when dealing with in-use hugetlb, it allows fallback to other NUMA nodes in alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool and might result in unexpected failures when node-bound workloads don't get what is assumed to be available. This patchset tries to make the hugetlb migration strategy clearer and more consistent. Please find details in each patch. [1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/ This patch (of 2): To support different hugetlb allocation strategies during hugetlb migration based on various migration reasons, record the migration reason in the migration_target_control structure as a preparation. Link: https://lkml.kernel.org/r/cover.1709719720.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/7b95d4981e07211f57139fc5b1f7ce91b920cee4.1709719720.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/migrate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 73a052a382f1..bde63010a3cf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2060,6 +2060,7 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node) struct migration_target_control mtc = { .nid = node, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, -- cgit v1.2.3-59-g8ed1b From 42d0c3fbb5811fbfb663d8ede1d7ffba02e7ae18 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 6 Mar 2024 18:13:27 +0800 Subject: mm: hugetlb: make the hugetlb migration strategy consistent As discussed in a previous thread [1], there is an inconsistency when handling hugetlb migration. When handling the migration of freed hugetlb, it prevents fallback to other NUMA nodes in alloc_and_dissolve_hugetlb_folio(). However, when dealing with in-use hugetlb, it allows fallback to other NUMA nodes in alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool and might result in unexpected failures when node-bound workloads don't get what is assumed to be available. To make the hugetlb migration strategy clearer, list all the hugetlb migration scenarios and analyze whether allocation fallback is permitted: 1) Memory offline: will call dissolve_free_huge_pages() to free the freed hugetlb, and call do_migrate_range() to migrate the in-use hugetlb. Both can break the per-node hugetlb pool, but as this is an explicit offlining operation there is no better choice, so hugetlb allocation fallback should be allowed. 2) Memory failure: same as memory offline. Fallback should be allowed, since a different node might be the only option for handling the failure; otherwise the impact of the poisoned memory can be amplified. 
3) Longterm pinning: will call migrate_longterm_unpinnable_pages() to migrate in-use and not-longterm-pinnable hugetlb, which can break the per-node pool. But longterm pinning should fail if we cannot allocate on the current node, in order to avoid breaking the per-node pool. 4) Syscalls (mbind, migrate_pages, move_pages): these are explicit user operations to move pages to other nodes, so fallback to other nodes should not be prohibited. 5) alloc_contig_range: used by CMA allocation and virtio-mem fake-offline to allocate a given range of pages. Since freed hugetlb migration is already not allowed to fall back, in-use hugetlb migration should, for consistency, not be allowed to fall back either. 6) alloc_contig_pages: used by kfence, pgtable_debug, etc. The strategy should be consistent with that of alloc_contig_range(). Based on the analysis of the various scenarios above, introduce a new helper to determine whether fallback is permitted according to the migration reason. [1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/ Link: https://lkml.kernel.org/r/3519fcd41522817307a05b40fb551e2e17e68101.1709719720.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++-- mm/hugetlb.c | 14 ++++++++++++-- mm/mempolicy.c | 3 ++- mm/migrate.c | 3 ++- 4 files changed, 49 insertions(+), 6 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 300de33c6fde..d748628efc5e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -723,7 +723,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask); + nodemask_t *nmask, gfp_t gfp_mask, + bool allow_alloc_fallback); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -946,6 +947,30 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) return modified_mask; } +static inline bool htlb_allow_alloc_fallback(int reason) +{ + bool allowed_fallback = false; + + /* + * Note: the memory offline, memory failure and migration syscalls will + * be allowed to fallback to other nodes due to lack of a better choice, + * that might break the per-node hugetlb pool. While other cases will + * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool. 
+ */ + switch (reason) { + case MR_MEMORY_HOTPLUG: + case MR_MEMORY_FAILURE: + case MR_SYSCALL: + case MR_MEMPOLICY_MBIND: + allowed_fallback = true; + break; + default: + break; + } + + return allowed_fallback; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { @@ -1041,7 +1066,8 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, + bool allow_alloc_fallback) { return NULL; } @@ -1157,6 +1183,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) return 0; } +static inline bool htlb_allow_alloc_fallback(int reason) +{ + return false; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e4dd0c802489..2e984554ec63 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2598,7 +2598,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { @@ -2613,6 +2613,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } spin_unlock_irq(&hugetlb_lock); + /* We cannot fallback to other nodes, as we could break the per-node pool. */ + if (!allow_alloc_fallback) + gfp_mask |= __GFP_THISNODE; + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } @@ -6636,7 +6640,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + /* + * This is used to allocate a temporary hugetlb to hold the copied + * content, which will then be copied again to the final hugetlb + * consuming a reservation. Set the alloc_fallback to false to indicate + * that breaking the per-node hugetlb pool is not allowed in this case. 
+ */ + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e128e6b7bbcb..72626b2cefa9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1228,7 +1228,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); - return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, + htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) diff --git a/mm/migrate.c b/mm/migrate.c index bde63010a3cf..ab9856f5931b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2022,7 +2022,8 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_hugetlb_folio_nodemask(h, nid, - mtc->nmask, gfp_mask); + mtc->nmask, gfp_mask, + htlb_allow_alloc_fallback(mtc->reason)); } if (folio_test_large(src)) { -- cgit v1.2.3-59-g8ed1b From 7262f208ca681385d133844be8a58d9b4ca185f7 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 22 Mar 2024 15:33:04 -0400 Subject: mm/migrate: split source folio if it is on deferred split list If the source folio is on deferred split list, it is likely some subpages are not used. Split it before migration to avoid migrating unused subpages. Commit 616b8371539a6 ("mm: thp: enable thp migration in generic path") did not check if a THP is on deferred split list before migration, thus, the destination THP is never put on deferred split list even if the source THP might be. The opportunity of reclaiming free pages in a partially mapped THP during deferred list scanning is lost, but no other harmful consequence is present[1]. [1]: https://lore.kernel.org/linux-mm/03CE3A00-917C-48CC-8E1C-6A98713C817C@nvidia.com/ [zi.yan@sent.com: fix an error in migrate_misplaced_folio()] Link: https://lkml.kernel.org/r/20240326150031.569387-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240322193304.522496-1-zi.yan@sent.com Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path") Signed-off-by: Zi Yan Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Huang, Ying Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: SeongJae Park Cc: Yang Shi Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/migrate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index ab9856f5931b..63c97bb639d7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1652,6 +1652,29 @@ static int migrate_pages_batch(struct list_head *from, cond_resched(); + /* + * The rare folio on the deferred split list should + * be split now. It should not count as a failure. + * Only check it without removing it from the list. + * Since the folio can be on deferred_split_scan() + * local list and removing it can cause the local list + * corruption. Folio split process below can handle it + * with the help of folio_ref_freeze(). + * + * nr_pages > 2 is needed to avoid checking order-1 + * page cache folios. They exist, in contrast to + * non-existent order-1 anonymous folios, and do not + * use _deferred_list. 
+ */ + if (nr_pages > 2 && + !list_empty(&folio->_deferred_list)) { + if (try_split_folio(folio, split_folios) == 0) { + stats->nr_thp_split += is_thp; + stats->nr_split++; + continue; + } + } + /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry -- cgit v1.2.3-59-g8ed1b From ebb34f78d72c2320620ba6d55cb22a52949047a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Feb 2024 21:15:48 +0100 Subject: mm: convert folio_estimated_sharers() to folio_likely_mapped_shared() Callers of folio_estimated_sharers() only care about "mapped shared vs. mapped exclusively", not the exact estimate of sharers. Let's consolidate and unify the condition users are checking. While at it clarify the semantics and extend the discussion on the fuzziness. Use the "likely mapped shared" terminology to better express what the (adjusted) function actually checks. Whether a partially-mappable folio is more likely to not be partially mapped than partially mapped is debatable. In the future, we might be able to improve our estimate for partially-mappable folios, though. Note that we will now consistently detect "mapped shared" only if the first subpage is actually mapped multiple times. When the first subpage is not mapped, we will consistently detect it as "mapped exclusively". This change should currently only affect the usage in madvise_free_pte_range() and queue_folios_pte_range() for large folios: if the first page was already unmapped, we would have skipped the folio. [david@redhat.com: folio_likely_mapped_shared() kerneldoc fixup] Link: https://lkml.kernel.org/r/dd0ad9f2-2d7a-45f3-9ba3-979488c7dd27@redhat.com Link: https://lkml.kernel.org/r/20240227201548.857831-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Khalid Aziz Acked-by: Barry Song Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Ryan Roberts Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 48 ++++++++++++++++++++++++++++++++++++++---------- mm/huge_memory.c | 2 +- mm/madvise.c | 6 +++--- mm/memory.c | 2 +- mm/mempolicy.c | 14 ++++++-------- mm/migrate.c | 8 ++++---- 6 files changed, 53 insertions(+), 27 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1588fe15a38e..fe66b515aaab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2165,21 +2165,49 @@ static inline size_t folio_size(struct folio *folio) } /** - * folio_estimated_sharers - Estimate the number of sharers of a folio. + * folio_likely_mapped_shared - Estimate if the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * folio_estimated_sharers() aims to serve as a function to efficiently - * estimate the number of processes sharing a folio. This is done by - * looking at the precise mapcount of the first subpage in the folio, and - * assuming the other subpages are the same. This may not be true for large - * folios. If you want exact mapcounts for exact calculations, look at - * page_mapcount() or folio_total_mapcount(). + * This function checks if the folio is currently mapped into more than one + * MM ("mapped shared"), or if the folio is only mapped into a single MM + * ("mapped exclusively"). * - * Return: The estimated number of processes sharing a folio. 
+ * As precise information is not easily available for all folios, this function + * estimates the number of MMs ("sharers") that are currently mapping a folio + * using the number of times the first page of the folio is currently mapped + * into page tables. + * + * For small anonymous folios (except KSM folios) and anonymous hugetlb folios, + * the return value will be exactly correct, because they can only be mapped + * at most once into an MM, and they cannot be partially mapped. + * + * For other folios, the result can be fuzzy: + * #. For partially-mappable large folios (THP), the return value can wrongly + * indicate "mapped exclusively" (false negative) when the folio is + * only partially mapped into at least one MM. + * #. For pagecache folios (including hugetlb), the return value can wrongly + * indicate "mapped shared" (false positive) when two VMAs in the same MM + * cover the same file range. + * #. For (small) KSM folios, the return value can wrongly indicate "mapped + * shared" (false positive) when the folio is mapped multiple times into + * the same MM. + * + * Further, this function only considers current page table mappings that + * are tracked using the folio mapcount(s). + * + * This function does not consider: + * #. If the folio might get mapped in the (near) future (e.g., swapcache, + * pagecache, temporary unmapping for migration). + * #. If the folio is mapped differently (VM_PFNMAP). + * #. If hugetlb page table sharing applies. Callers might want to check + * hugetlb_pmd_shared(). + * + * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline int folio_estimated_sharers(struct folio *folio) +static inline bool folio_likely_mapped_shared(struct folio *folio) { - return page_mapcount(folio_page(folio, 0)); + return page_mapcount(folio_page(folio, 0)) > 1; } #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 16b2c5622fb1..1170fc22ed89 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1821,7 +1821,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. 
*/ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) diff --git a/mm/madvise.c b/mm/madvise.c index a2dd70c4a2e6..7625830d6ae9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -366,7 +366,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio = pfn_folio(pmd_pfn(orig_pmd)); /* Do not interfere with other mappings of this folio */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -453,7 +453,7 @@ restart: if (folio_test_large(folio)) { int err; - if (folio_estimated_sharers(folio) > 1) + if (folio_likely_mapped_shared(folio)) break; if (pageout_anon_only_filter && !folio_test_anon(folio)) break; @@ -677,7 +677,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (folio_test_large(folio)) { int err; - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) break; if (!folio_trylock(folio)) break; diff --git a/mm/memory.c b/mm/memory.c index 805cebb6fd72..bcd29104f5ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5112,7 +5112,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; nid = folio_nid(folio); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72626b2cefa9..913cff5da5a3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -642,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || - (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: @@ -1032,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. 
*/ - if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, diff --git a/mm/migrate.c b/mm/migrate.c index 63c97bb639d7..a31aa75d223d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2593,11 +2593,11 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, /* * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && + if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; -- cgit v1.2.3-59-g8ed1b From 4dc7d37370951fe86216f03a4e0a6909f9b90a8c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 Mar 2024 17:10:31 +0000 Subject: remove references to page->flags in documentation Mostly rewording, but remove entirely the copy of page_fixed_fake_head() in the documentation; we can refer people to the actual source if necessary. Link: https://lkml.kernel.org/r/20240326171045.410737-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 ++-- Documentation/mm/vmemmap_dedup.rst | 22 +--------------------- .../translations/zh_CN/core-api/cachetlb.rst | 2 +- mm/migrate.c | 2 +- mm/rmap.c | 4 ++-- 5 files changed, 7 insertions(+), 27 deletions(-) (limited to 'mm/migrate.c') diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index ca7d9402f6be..46110e6a31bb 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -300,14 +300,14 @@ When oom event notifier is registered, event will be delivered. Lock order is as follows:: - Page lock (PG_locked bit of page->flags) + folio_lock mm->page_table_lock or split pte_lock folio_memcg_lock (memcg->move_lock) mapping->i_pages lock lruvec->lru_lock. Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by -lruvec->lru_lock; PG_lru bit of page->flags is cleared before +lruvec->lru_lock; the folio LRU flag is cleared before isolating a page from its LRU under lruvec->lru_lock. .. _cgroup-v1-memory-kernel-extension: diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index 593ede6d314b..b4a55b6569fa 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -180,27 +180,7 @@ this correctly. There is only **one** head ``struct page``, the tail ``struct page`` with ``PG_head`` are fake head ``struct page``. We need an approach to distinguish between those two different types of ``struct page`` so that ``compound_head()`` can return the real head ``struct page`` when the -parameter is the tail ``struct page`` but with ``PG_head``. The following code -snippet describes how to distinguish between real and fake head ``struct page``. - -.. 
code-block:: c - - if (test_bit(PG_head, &page->flags)) { - unsigned long head = READ_ONCE(page[1].compound_head); - - if (head & 1) { - if (head == (unsigned long)page + 1) - /* head struct page */ - else - /* tail struct page */ - } else { - /* head struct page */ - } - } - -We can safely access the field of the **page[1]** with ``PG_head`` because the -page is a compound page composed with at least two contiguous pages. -The implementation refers to ``page_fixed_fake_head()``. +parameter is the tail ``struct page`` but with ``PG_head``. Device DAX ========== diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst index b4a76ec75daa..64295c61d1c1 100644 --- a/Documentation/translations/zh_CN/core-api/cachetlb.rst +++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst @@ -260,7 +260,7 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。 如果D-cache别名不是一个问题,这个程序可以简单地定义为该架构上 的nop。 - 在page->flags (PG_arch_1)中有一个位是“架构私有”。内核保证, + 在folio->flags (PG_arch_1)中有一个位是“架构私有”。内核保证, 对于分页缓存的页面,当这样的页面第一次进入分页缓存时,它将清除 这个位。 diff --git a/mm/migrate.c b/mm/migrate.c index a31aa75d223d..285072bca29c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,7 +113,7 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; - /* Driver shouldn't use PG_isolated bit of page->flags */ + /* Driver shouldn't use the isolated flag */ WARN_ON_ONCE(folio_test_isolated(folio)); folio_set_isolated(folio); folio_unlock(folio); diff --git a/mm/rmap.c b/mm/rmap.c index d52759aa3ff7..5ee9e338d09b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,7 +23,7 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) + * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem @@ -50,7 +50,7 @@ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) - * page->flags PG_locked (lock_page) + * folio_lock */ #include -- cgit v1.2.3-59-g8ed1b From 31ce0d7ef8412e3f38cf90bb90b7436130c60614 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Apr 2024 21:22:54 +0200 Subject: mm/migrate: use folio_likely_mapped_shared() in add_page_for_migration() We want to limit the use of page_mapcount() to the places where it is absolutely necessary. In add_page_for_migration(), we actually want to check if the folio is mapped shared, to reject such folios. So let's use folio_likely_mapped_shared() instead. For small folios, fully mapped THP, and hugetlb folios, there is no change. For partially mapped, shared THP, we should now do a better job at rejecting such folios. 
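A rough, self-contained C model of the check being swapped in may help here; the mock_folio type and helper below are invented for illustration and are not kernel API. They only mirror the idea that folio_likely_mapped_shared() bases its decision on the first subpage's mapcount, whereas the old code consulted page_mapcount() of whichever subpage the syscall happened to resolve.

#include <stdbool.h>
#include <stdio.h>

#define NR_SUBPAGES 8	/* pretend THP with 8 subpages, purely illustrative */

struct mock_folio {
	int mapcount[NR_SUBPAGES];	/* per-subpage mapcounts */
};

/* Mirrors the new check: only the first subpage's mapcount is consulted. */
static bool likely_mapped_shared(const struct mock_folio *folio)
{
	return folio->mapcount[0] > 1;
}

int main(void)
{
	/* Partially mapped, shared THP: first half mapped by two MMs. */
	struct mock_folio partially_shared = {
		.mapcount = { 2, 2, 2, 2, 1, 1, 1, 1 }
	};
	/* THP mapped exactly once, by a single MM. */
	struct mock_folio exclusive = {
		.mapcount = { 1, 1, 1, 1, 1, 1, 1, 1 }
	};

	/* The old per-page check could return 1 or 2 depending on which
	 * subpage the user address hit; the folio-level check is stable. */
	printf("partially shared THP rejected: %d\n",
	       likely_mapped_shared(&partially_shared));	/* prints 1 */
	printf("exclusive THP rejected: %d\n",
	       likely_mapped_shared(&exclusive));		/* prints 0 */
	return 0;
}
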
Link: https://lkml.kernel.org/r/20240409192301.907377-12-david@redhat.com Signed-off-by: David Hildenbrand Cc: Chris Zankel Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Richard Chang Cc: Rich Felker Cc: Ryan Roberts Cc: Yang Shi Cc: Yin Fengwei Cc: Yoshinori Sato Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 285072bca29c..d87ce32645d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2140,7 +2140,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, goto out_putfolio; err = -EACCES; - if (page_mapcount(page) > 1 && !migrate_all) + if (folio_likely_mapped_shared(folio) && !migrate_all) goto out_putfolio; err = -EBUSY; -- cgit v1.2.3-59-g8ed1b From 79899cce33e0887c06d41e767aa543aaaaef48e2 Mon Sep 17 00:00:00 2001 From: "Alex Shi (tencent)" Date: Thu, 11 Apr 2024 14:17:09 +0800 Subject: mm/ksm: convert chain series funcs and replace get_ksm_page In ksm stable tree all page are single, let's convert them to use and folios as well as stable_tree_insert/stable_tree_search funcs. And replace get_ksm_page() by ksm_get_folio() since there is no more needs. It could save a few compound_head calls. Link: https://lkml.kernel.org/r/20240411061713.1847574-9-alexs@kernel.org Signed-off-by: Alex Shi (tencent) Cc: Izik Eidus Cc: Matthew Wilcox Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Chris Wright Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/ksm.c | 136 ++++++++++++++++++++++++++++------------------------------- mm/migrate.c | 2 +- 2 files changed, 66 insertions(+), 72 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/ksm.c b/mm/ksm.c index 2fdd6586a3a7..61a7b5b037a6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -990,14 +990,6 @@ stale: return NULL; } -static struct page *get_ksm_page(struct ksm_stable_node *stable_node, - enum get_ksm_page_flags flags) -{ - struct folio *folio = ksm_get_folio(stable_node, flags); - - return &folio->page; -} - /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. @@ -1632,10 +1624,10 @@ bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, - struct ksm_stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; @@ -1748,7 +1740,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, } *_stable_node_dup = found; - return &tree_folio->page; + return tree_folio; } static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, @@ -1765,7 +1757,7 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl } /* - * Like for get_ksm_page, this function can free the *_stable_node and + * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. 
* * It can also free and overwrite *_stable_node with the found @@ -1778,16 +1770,16 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ -static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, - struct ksm_stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); + return ksm_get_folio(stable_node, GET_KSM_PAGE_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1800,24 +1792,24 @@ static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_du prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, - struct ksm_stable_node **s_n, - struct rb_root *root) +static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, + struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, - struct ksm_stable_node *s_n, - struct rb_root *root) +static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, + struct rb_root *root) { struct ksm_stable_node *old_stable_node = s_n; - struct page *tree_page; + struct folio *tree_folio; - tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + tree_folio = __stable_node_chain(s_n_d, &s_n, root, false); /* not pruning dups so s_n cannot have changed */ VM_BUG_ON(s_n != old_stable_node); - return tree_page; + return tree_folio; } /* @@ -1837,28 +1829,30 @@ static struct page *stable_tree_search(struct page *page) struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; struct ksm_stable_node *page_node; + struct folio *folio; - page_node = page_stable_node(page); + folio = page_folio(page); + page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ - get_page(page); - return page; + folio_get(folio); + return &folio->page; } - nid = get_kpfn_nid(page_to_pfn(page)); + nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { - struct page *tree_page; + struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; - tree_page = chain_prune(&stable_node_dup, &stable_node, root); + tree_folio = chain_prune(&stable_node_dup, &stable_node, root); /* * NOTE: stable_node may have been freed by * chain_prune() if the returned stable_node_dup is @@ -1892,14 +1886,14 @@ again: * write protected at all times. Any will work * fine to continue the walk. 
*/ - tree_page = get_ksm_page(stable_node_any, - GET_KSM_PAGE_NOLOCK); + tree_folio = ksm_get_folio(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); - if (!tree_page) { + if (!tree_folio) { /* * If we walked over a stale stable_node, - * get_ksm_page() will call rb_erase() and it + * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate @@ -1909,8 +1903,8 @@ again: goto again; } - ret = memcmp_pages(page, tree_page); - put_page(tree_page); + ret = memcmp_pages(page, &tree_folio->page); + folio_put(tree_folio); parent = *new; if (ret < 0) @@ -1953,26 +1947,26 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, - GET_KSM_PAGE_TRYLOCK); + tree_folio = ksm_get_folio(stable_node_dup, + GET_KSM_PAGE_TRYLOCK); - if (PTR_ERR(tree_page) == -EBUSY) + if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); - if (unlikely(!tree_page)) + if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; - unlock_page(tree_page); + folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { - put_page(tree_page); + folio_put(tree_folio); goto replace; } - return tree_page; + return &tree_folio->page; } } @@ -1985,8 +1979,8 @@ again: rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { - get_page(page); - return page; + folio_get(folio); + return &folio->page; } else return NULL; @@ -2011,12 +2005,12 @@ replace: &page_node->node, root); if (is_page_sharing_candidate(page_node)) - get_page(page); + folio_get(folio); else - page = NULL; + folio = NULL; } else { rb_erase(&stable_node_dup->node, root); - page = NULL; + folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); @@ -2027,16 +2021,16 @@ replace: DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) - get_page(page); + folio_get(folio); else - page = NULL; + folio = NULL; } else { - page = NULL; + folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); - return page; + return &folio->page; chain_append: /* stable_node_dup could be null if it reached the limit */ @@ -2079,7 +2073,7 @@ chain_append: * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct ksm_stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; @@ -2089,7 +2083,7 @@ static struct ksm_stable_node *stable_tree_insert(struct page *kpage) struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; - kpfn = page_to_pfn(kpage); + kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: @@ -2097,13 +2091,13 @@ again: new = &root->rb_node; while (*new) { - struct page *tree_page; + struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; - tree_page = chain(&stable_node_dup, stable_node, root); + tree_folio = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { /* * Either all stable_node dups were full in @@ -2125,14 +2119,14 @@ again: * write protected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, - GET_KSM_PAGE_NOLOCK); + tree_folio = ksm_get_folio(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); - if (!tree_page) { + if (!tree_folio) { /* * If we walked over a stale stable_node, - * get_ksm_page() will call rb_erase() and it + * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate @@ -2142,8 +2136,8 @@ again: goto again; } - ret = memcmp_pages(kpage, tree_page); - put_page(tree_page); + ret = memcmp_pages(&kfolio->page, &tree_folio->page); + folio_put(tree_folio); parent = *new; if (ret < 0) @@ -2162,7 +2156,7 @@ again: INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; - set_page_stable_node(kpage, stable_node_dup); + folio_set_stable_node(kfolio, stable_node_dup); stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { @@ -2440,7 +2434,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * node in the stable tree and add both rmap_items. */ lock_page(kpage); - stable_node = stable_tree_insert(kpage); + stable_node = stable_tree_insert(page_folio(kpage)); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); @@ -3244,7 +3238,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible - * to get_ksm_page() before it can see that folio->mapping + * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that folio_test_swapcache has been cleared). */ smp_wmb(); @@ -3271,7 +3265,7 @@ static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* - * Don't get_ksm_page, page has already gone: + * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); @@ -3359,7 +3353,7 @@ static int ksm_memory_callback(struct notifier_block *self, * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, - * otherwise get_ksm_page() might later try to access a + * otherwise ksm_get_folio() might later try to access a * non-existent struct page. 
*/ ksm_check_stable_tree(mn->start_pfn, diff --git a/mm/migrate.c b/mm/migrate.c index d87ce32645d4..c7692f303fa7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -616,7 +616,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's - * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache(). */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); -- cgit v1.2.3-59-g8ed1b From 6e8cda4c2c87b2a44828e651a10705647a6fd542 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:03 +0100 Subject: mm: convert hugetlb_page_mapping_lock_write to folio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page is only used to get the mapping, so the folio will do just as well. Both callers already have a folio available, so this saves a call to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu  Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Dan Williams Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 6 +++--- mm/memory-failure.c | 2 +- mm/migrate.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1bc93e7e315b..68244bb3637a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -178,7 +178,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; @@ -297,8 +297,8 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } -static inline struct address_space *hugetlb_page_mapping_lock_write( - struct page *hpage) +static inline struct address_space *hugetlb_folio_mapping_lock_write( + struct folio *folio) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3b7d5ddc32ad..417fc5cdb6ee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2155,13 +2155,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { - struct address_space *mapping = page_mapping(hpage); + struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4daf581e3878..1a5f3403dd2a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1624,7 +1624,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * TTU_RMAP_LOCKED to indicate we have taken the lock * at this higher level. 
*/ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_folio_mapping_lock_write(folio); if (mapping) { try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); diff --git a/mm/migrate.c b/mm/migrate.c index c7692f303fa7..dd04f578c19c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1425,7 +1425,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(&src->page); + mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; -- cgit v1.2.3-59-g8ed1b
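Taken together, the two hugetlb patches at the top of this log reduce to a small decision table keyed on the migration reason. The standalone sketch below replays that table; the enum is a trimmed, re-numbered stand-in for the kernel's enum migrate_reason (only the members relevant to the scenario list are kept), so treat it as an illustration rather than the real header.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's enum migrate_reason. */
enum migrate_reason {
	MR_MEMORY_HOTPLUG,
	MR_MEMORY_FAILURE,
	MR_SYSCALL,
	MR_MEMPOLICY_MBIND,
	MR_LONGTERM_PIN,
	MR_CONTIG_RANGE,
};

/* Mirrors the switch in htlb_allow_alloc_fallback(): only explicit offlining,
 * poison handling and the migration syscalls may break the per-node pool. */
static bool htlb_allow_alloc_fallback(enum migrate_reason reason)
{
	switch (reason) {
	case MR_MEMORY_HOTPLUG:
	case MR_MEMORY_FAILURE:
	case MR_SYSCALL:
	case MR_MEMPOLICY_MBIND:
		return true;
	default:
		return false;
	}
}

int main(void)
{
	static const char *const names[] = {
		"hotplug", "failure", "syscall", "mbind",
		"longterm pin", "contig range",
	};

	for (int r = MR_MEMORY_HOTPLUG; r <= MR_CONTIG_RANGE; r++)
		printf("%-13s -> fallback %s\n", names[r],
		       htlb_allow_alloc_fallback(r) ? "allowed" : "forbidden");
	return 0;
}

Running this prints "allowed" only for the hotplug, failure, syscall and mbind reasons, matching the scenario analysis in the message of commit 42d0c3fbb581; every other reason keeps __GFP_THISNODE set and so stays on the requested node.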