From a455aa72f7c46b881721668b3ee810713adc7a5b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 15 Oct 2020 20:04:22 -0700 Subject: device-dax/kmem: fix resource release The conversion to request_mem_region() is broken because it assumes that the range is marked busy prior to release. However, due to the way that the kmem driver manipulates the IORESOURCE_BUSY flag (clears it to let {add,remove}_memory() handle busy) it requires a manual release_resource() to perform cleanup. Given that the actual 'struct resource *' needs to be recalled, not just the range, add that tracking to the kmem driver-data. Fixes: 0513bd5bb114 ("device-dax/kmem: replace release_resource() with release_mem_region()") Reported-by: David Hildenbrand Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Cc: Vishal Verma Cc: Dave Hansen Cc: Pavel Tatashin Cc: Brice Goglin Cc: Dave Jiang Cc: Ira Weiny Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Link: https://lkml.kernel.org/r/160272252925.3136502.17220638073995895400.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- drivers/dax/kmem.c | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 6c933f2b604e..af04b6d1d263 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -35,11 +35,17 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) return 0; } +struct dax_kmem_data { + const char *res_name; + struct resource *res[]; +}; + static int dev_dax_kmem_probe(struct dev_dax *dev_dax) { struct device *dev = &dev_dax->dev; + struct dax_kmem_data *data; + int rc = -ENOMEM; int i, mapped = 0; - char *res_name; int numa_node; /* @@ -55,14 +61,17 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return -EINVAL; } - res_name = kstrdup(dev_name(dev), GFP_KERNEL); - if (!res_name) + data = kzalloc(sizeof(*data) + sizeof(struct resource *) * dev_dax->nr_range, GFP_KERNEL); + if (!data) return -ENOMEM; + data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); + if (!data->res_name) + goto err_res_name; + for (i = 0; i < dev_dax->nr_range; i++) { struct resource *res; struct range range; - int rc; rc = dax_kmem_range(dev_dax, i, &range); if (rc) { @@ -72,7 +81,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) } /* Region is permanently reserved if hotremove fails. */ - res = request_mem_region(range.start, range_len(&range), res_name); + res = request_mem_region(range.start, range_len(&range), data->res_name); if (!res) { dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", i, range.start, range.end); @@ -82,9 +91,10 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) */ if (mapped) continue; - kfree(res_name); - return -EBUSY; + rc = -EBUSY; + goto err_request_mem; } + data->res[i] = res; /* * Set flags appropriate for System RAM. 
Leave ..._BUSY clear @@ -104,18 +114,25 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) if (rc) { dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", i, range.start, range.end); - release_mem_region(range.start, range_len(&range)); + release_resource(res); + kfree(res); + data->res[i] = NULL; if (mapped) continue; - kfree(res_name); - return rc; + goto err_request_mem; } mapped++; } - dev_set_drvdata(dev, res_name); + dev_set_drvdata(dev, data); return 0; + +err_request_mem: + kfree(data->res_name); +err_res_name: + kfree(data); + return rc; } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -123,7 +140,7 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax) { int i, success = 0; struct device *dev = &dev_dax->dev; - const char *res_name = dev_get_drvdata(dev); + struct dax_kmem_data *data = dev_get_drvdata(dev); /* * We have one shot for removing memory, if some memory blocks were not @@ -142,7 +159,9 @@ rc = remove_memory(dev_dax->target_node, range.start, range_len(&range)); if (rc == 0) { - release_mem_region(range.start, range_len(&range)); + release_resource(data->res[i]); + kfree(data->res[i]); + data->res[i] = NULL; success++; continue; } @@ -153,7 +172,8 @@ } if (success >= dev_dax->nr_range) { - kfree(res_name); + kfree(data->res_name); + kfree(data); dev_set_drvdata(dev, NULL); } -- cgit v1.2.3-59-g8ed1b From 392b466981ac42f491173c37f6ed574841645bcc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:26 -0700 Subject: powerpc/mm: add DEBUG_VM WARN for pmd_clear Patch series "mm/debug_vm_pgtable fixes", v4. This patch series includes fixes for the debug_vm_pgtable test code so that the tests follow page table update rules correctly. The first two patches introduce changes w.r.t ppc64. The hugetlb test is disabled on ppc64 because it needs a larger change to satisfy page table update rules. These tests are broken w.r.t page table update rules and result in kernel crashes such as the one below. [ 21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304! cpu 0x0: Vector: 700 (Program Check) at [c000000c6d1e76c0] pc: c00000000009a5ec: assert_pte_locked+0x14c/0x380 lr: c0000000005eeeec: pte_update+0x11c/0x190 sp: c000000c6d1e7950 msr: 8000000002029033 current = 0xc000000c6d172c80 paca = 0xc000000003ba0000 irqmask: 0x03 irq_happened: 0x01 pid = 1, comm = swapper/0 kernel BUG at arch/powerpc/mm/pgtable.c:304!
[link register ] c0000000005eeeec pte_update+0x11c/0x190 [c000000c6d1e7950] 0000000000000001 (unreliable) [c000000c6d1e79b0] c0000000005eee14 pte_update+0x44/0x190 [c000000c6d1e7a10] c000000001a2ca9c pte_advanced_tests+0x160/0x3d8 [c000000c6d1e7ab0] c000000001a2d4fc debug_vm_pgtable+0x7e8/0x1338 [c000000c6d1e7ba0] c0000000000116ec do_one_initcall+0xac/0x5f0 [c000000c6d1e7c80] c0000000019e4fac kernel_init_freeable+0x4dc/0x5a4 [c000000c6d1e7db0] c000000000012474 kernel_init+0x24/0x160 [c000000c6d1e7e20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c With DEBUG_VM disabled [ 20.530152] BUG: Kernel NULL pointer dereference on read at 0x00000000 [ 20.530183] Faulting instruction address: 0xc0000000000df330 cpu 0x33: Vector: 380 (Data SLB Access) at [c000000c6d19f700] pc: c0000000000df330: memset+0x68/0x104 lr: c00000000009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0 sp: c000000c6d19f990 msr: 8000000002009033 dar: 0 current = 0xc000000c6d177480 paca = 0xc00000001ec4f400 irqmask: 0x03 irq_happened: 0x01 pid = 1, comm = swapper/0 [link register ] c00000000009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0 [c000000c6d19f990] c00000000009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable) [c000000c6d19fa10] c0000000019ebf30 pmd_advanced_tests+0x1f0/0x378 [c000000c6d19fab0] c0000000019ed088 debug_vm_pgtable+0x79c/0x1244 [c000000c6d19fba0] c0000000000116ec do_one_initcall+0xac/0x5f0 [c000000c6d19fc80] c0000000019a4fac kernel_init_freeable+0x4dc/0x5a4 [c000000c6d19fdb0] c000000000012474 kernel_init+0x24/0x160 [c000000c6d19fe20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c This patch (of 13): With the hash page table, the kernel should not use pmd_clear for clearing huge pte entries. Add a DEBUG_VM WARN to catch the wrong usage. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Cc: Michael Ellerman Cc: Anshuman Khandual Cc: Aneesh Kumar K.V Cc: Christophe Leroy Link: https://lkml.kernel.org/r/20200902114222.181353-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20200902114222.181353-2-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 495fc0ccb453..e9ada4494514 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -866,6 +866,13 @@ static inline bool pte_ci(pte_t pte) static inline void pmd_clear(pmd_t *pmdp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* + * Don't use this if we can possibly have a hash page table + * entry mapping this. + */ + WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pmdp = __pmd(0); } @@ -914,6 +921,13 @@ static inline int pmd_bad(pmd_t pmd) static inline void pud_clear(pud_t *pudp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* + * Don't use this if we can possibly have a hash page table + * entry mapping this. + */ + WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pudp = __pud(0); } -- cgit v1.2.3-59-g8ed1b From 379c926d6334427b35fe7747f7758c865e20e9a9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:29 -0700 Subject: powerpc/mm: move setting pte specific flags to pfn_pte powerpc used to set the pte specific flags in set_pte_at(). This is different from other architectures. 
To be consistent with other architectures, update pfn_pte() to set _PAGE_PTE on ppc64. Also, drop the now-unused pte_mkpte(). We add a VM_WARN_ON() to catch the usage of calling set_pte_at() without setting the _PAGE_PTE bit. We will remove that after a few releases. With respect to huge pmd entries, pmd_mkhuge() takes care of adding the _PAGE_PTE bit. [akpm@linux-foundation.org: whitespace fix, per Christophe] Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Christophe Leroy Cc: Anshuman Khandual Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-3-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 15 +++++++++------ arch/powerpc/include/asm/nohash/pgtable.h | 5 ----- arch/powerpc/mm/pgtable.c | 5 ----- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e9ada4494514..f6d2c4449aeb 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -615,7 +615,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) VM_BUG_ON(pfn >> (64 - PAGE_SHIFT)); VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK); - return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } static inline unsigned long pte_pfn(pte_t pte) @@ -651,11 +651,6 @@ static inline pte_t pte_mkexec(pte_t pte) return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC)); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); -} - static inline pte_t pte_mkwrite(pte_t pte) { /* @@ -819,6 +814,14 @@ static inline int pte_none(pte_t pte) static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, int percpu) { + + VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE))); + /* + * Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE + * in all the callers. + */ + pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); + if (radix_enabled()) return radix__set_pte_at(mm, addr, ptep, pte, percpu); return hash__set_pte_at(mm, addr, ptep, pte, percpu); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 4b7c3472eab1..6277e7596ae5 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -140,11 +140,6 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return pte; -} - static inline pte_t pte_mkspecial(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPECIAL); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9c0547d77af3..ab57b07ef39a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -184,9 +184,6 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - /* Add the pte bit when trying to set a pte */ - pte = pte_mkpte(pte); - /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called.
@@ -275,8 +272,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = pte_mkpte(pte); - pte = set_pte_filter(pte); val = pte_val(pte); -- cgit v1.2.3-59-g8ed1b From cfc5bbc4e75da4e341b86822c3c8ab1f4d804de5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:33 -0700 Subject: mm/debug_vm_pgtable/ppc64: avoid setting top bits in random value ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting that bit in the random value. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-4-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 086309fb9b6f..00649b47f6e0 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -44,10 +44,17 @@ * entry type. But these bits might affect the ability to clear entries with * pxx_clear() because of how dynamic page table folding works on s390. So * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. + * have affect any other platform. Also avoid the 62nd bit on ppc64 that is + * used to mark a pte entry. */ -#define S390_MASK_BITS 4 -#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define S390_SKIP_MASK GENMASK(3, 0) +#if __BITS_PER_LONG == 64 +#define PPC64_SKIP_MASK GENMASK(62, 62) +#else +#define PPC64_SKIP_MASK 0x0 +#endif +#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) +#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) -- cgit v1.2.3-59-g8ed1b From 85a144632dcc71f96dcb9172a15d6f71161144d2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:36 -0700 Subject: mm/debug_vm_pgtables/hugevmap: use the arch helper to identify huge vmap support. ppc64 supports huge vmap only with radix translation. Hence use the arch helper to determine huge vmap support.
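For illustration, the distinction this relies on can be sketched as follows (a minimal sketch, not part of the patch; arch_ioremap_pmd_supported() is declared in linux/io.h when CONFIG_HAVE_ARCH_HUGE_VMAP is set):

	#include <linux/io.h>

	static bool __init can_exercise_pmd_huge_vmap(void)
	{
		/*
		 * IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP) is answered at
		 * build time, and stays true on a ppc64 kernel even when
		 * it boots with hash translation, where huge vmap does
		 * not work. Only the runtime helper reflects the active
		 * MMU mode (radix vs. hash).
		 */
		return arch_ioremap_pmd_supported();
	}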
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-5-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 00649b47f6e0..4c73e63b4ceb 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -206,11 +207,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + if (!arch_ioremap_pmd_supported()) return; pr_debug("Validating PMD huge\n"); @@ -224,6 +226,9 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); } +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { @@ -320,11 +325,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pud_leaf(pud)); } +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { pud_t pud; - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + if (!arch_ioremap_pud_supported()) return; pr_debug("Validating PUD huge\n"); @@ -338,6 +344,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); } +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ + #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init pud_advanced_tests(struct mm_struct *mm, -- cgit v1.2.3-59-g8ed1b From 4200605b1f80e4644a5cb5abc708bccfb6def11a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:40 -0700 Subject: mm/debug_vm_pgtable/savedwrite: enable savedwrite test with CONFIG_NUMA_BALANCING Saved write support was added to track the write bit of a pte after marking the pte protnone. This was done so that AUTONUMA can convert a write pte to protnone and still track the old write bit. When converting it back we set the pte write bit correctly thereby avoiding a write fault again. Hence enable the test only when CONFIG_NUMA_BALANCING is enabled and use protnone protflags. 
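As a simplified illustration of the mechanism under test (a sketch built from the generic helpers; the real transitions live in the NUMA-hinting paths):

	/*
	 * NUMA balancing makes a writable pte inaccessible, but
	 * remembers that it was writable:
	 */
	pte = pte_mk_savedwrite(pte_modify(pte, PAGE_NONE));

	/*
	 * On the hinting fault, the saved bit lets us hand back a
	 * writable mapping directly, avoiding a second write fault:
	 */
	if (pte_savedwrite(pte))
		pte = pte_mkwrite(pte);

This is also why the test has to use protnone protections: saved write is only meaningful on a protnone pte.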
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-6-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4c73e63b4ceb..8704901f6bd8 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -119,10 +119,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pte_t pte = pfn_pte(pfn, prot); + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + pr_debug("Validating PTE saved write\n"); WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); } + #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { @@ -234,6 +238,9 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pmd_t pmd = pfn_pmd(pfn, prot); + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + pr_debug("Validating PMD saved write\n"); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); @@ -1019,8 +1026,8 @@ static int __init debug_vm_pgtable(void) pmd_huge_tests(pmdp, pmd_aligned, prot); pud_huge_tests(pudp, pud_aligned, prot); - pte_savedwrite_tests(pte_aligned, prot); - pmd_savedwrite_tests(pmd_aligned, prot); + pte_savedwrite_tests(pte_aligned, protnone); + pmd_savedwrite_tests(pmd_aligned, protnone); pte_unmap_unlock(ptep, ptl); -- cgit v1.2.3-59-g8ed1b From c3824e18d3f3946015ac641d46fdeeefd30bfd66 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:46 -0700 Subject: mm/debug_vm_pgtable/set_pte/pmd/pud: don't use set_*_at to update an existing pte entry set_pte_at() should not be used to set a pte entry at locations that already hold a valid pte entry. Architectures like ppc64 don't do TLB invalidate in set_pte_at() and hence expect it to be used to set locations that are not a valid PTE. Link: https://lkml.kernel.org/r/20200902114222.181353-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 8704901f6bd8..2ba5ec866f53 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -79,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm, { pte_t pte = pfn_pte(pfn, prot); + /* + * Architectures optimize set_pte_at by avoiding TLB flush. + * This requires set_pte_at to be not used to update an + * existing pte entry.
Clear pte before we do set_pte_at + */ + pr_debug("Validating PTE advanced\n"); pte = pfn_pte(pfn, prot); set_pte_at(mm, vaddr, ptep, pte); ptep_set_wrprotect(mm, vaddr, ptep); pte = ptep_get(ptep); WARN_ON(pte_write(pte)); - - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); ptep_get_and_clear(mm, vaddr, ptep); pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); @@ -101,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm, ptep_set_access_flags(vma, vaddr, ptep, pte, 1); pte = ptep_get(ptep); WARN_ON(!(pte_write(pte) && pte_dirty(pte))); - - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); ptep_get_and_clear_full(mm, vaddr, ptep, 1); pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); + pte = pfn_pte(pfn, prot); pte = pte_mkyoung(pte); set_pte_at(mm, vaddr, ptep, pte); ptep_test_and_clear_young(vma, vaddr, ptep); @@ -169,9 +170,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_set_wrprotect(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_write(pmd)); - - pmd = pfn_pmd(pfn, prot); - set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); @@ -185,13 +183,11 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1); pmd = READ_ONCE(*pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); - - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); - set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); pmd = pmd_mkyoung(pmd); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_test_and_clear_young(vma, vaddr, pmdp); @@ -291,17 +287,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm, WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); pudp_huge_get_and_clear(mm, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); - - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); - pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); - pud = READ_ONCE(*pudp); - WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(pfn, prot); pud = pud_wrprotect(pud); @@ -313,6 +301,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pud = READ_ONCE(*pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); +#ifndef __PAGETABLE_PMD_FOLDED + pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +#endif /* __PAGETABLE_PMD_FOLDED */ + + pud = pfn_pud(pfn, prot); pud = pud_mkyoung(pud); set_pud_at(mm, vaddr, pudp, pud); pudp_test_and_clear_young(vma, vaddr, pudp); -- cgit v1.2.3-59-g8ed1b From e8edf0adb95dc311e3beae361917d30d72845e04 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:49 -0700 Subject: mm/debug_vm_pgtable/locks: move non page table modifying test together This will help in adding proper locks in a later patch Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Cc: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-9-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2ba5ec866f53..3ed3fb68922c 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -984,7 +984,7 
@@ static int __init debug_vm_pgtable(void) p4dp = p4d_alloc(mm, pgdp, vaddr); pudp = pud_alloc(mm, p4dp, vaddr); pmdp = pmd_alloc(mm, pudp, vaddr); - ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + ptep = pte_alloc_map(mm, pmdp, vaddr); /* * Save all the page table page addresses as the page table @@ -1004,33 +1004,12 @@ static int __init debug_vm_pgtable(void) p4d_basic_tests(p4d_aligned, prot); pgd_basic_tests(pgd_aligned, prot); - pte_clear_tests(mm, ptep, vaddr); - pmd_clear_tests(mm, pmdp); - pud_clear_tests(mm, pudp); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); - - pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_leaf_tests(pmd_aligned, prot); pud_leaf_tests(pud_aligned, prot); - pmd_huge_tests(pmdp, pmd_aligned, prot); - pud_huge_tests(pudp, pud_aligned, prot); - pte_savedwrite_tests(pte_aligned, protnone); pmd_savedwrite_tests(pmd_aligned, protnone); - pte_unmap_unlock(ptep, ptl); - - pmd_populate_tests(mm, pmdp, saved_ptep); - pud_populate_tests(mm, pudp, saved_pmdp); - p4d_populate_tests(mm, p4dp, saved_pudp); - pgd_populate_tests(mm, pgdp, saved_p4dp); - pte_special_tests(pte_aligned, prot); pte_protnone_tests(pte_aligned, protnone); pmd_protnone_tests(pmd_aligned, protnone); @@ -1048,11 +1027,37 @@ static int __init debug_vm_pgtable(void) pmd_swap_tests(pmd_aligned, prot); swap_migration_tests(); - hugetlb_basic_tests(pte_aligned, prot); pmd_thp_tests(pmd_aligned, prot); pud_thp_tests(pud_aligned, prot); + hugetlb_basic_tests(pte_aligned, prot); + + pte_clear_tests(mm, ptep, vaddr); + pmd_clear_tests(mm, pmdp); + pud_clear_tests(mm, pudp); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + + ptl = pte_lockptr(mm, pmdp); + spin_lock(ptl); + + pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); + pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); + hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); + + + pmd_huge_tests(pmdp, pmd_aligned, prot); + pud_huge_tests(pudp, pud_aligned, prot); + + pte_unmap_unlock(ptep, ptl); + + pmd_populate_tests(mm, pmdp, saved_ptep); + pud_populate_tests(mm, pudp, saved_pmdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + p4d_free(mm, saved_p4dp); pud_free(mm, saved_pudp); pmd_free(mm, saved_pmdp); -- cgit v1.2.3-59-g8ed1b From 6f302e270c999d694a5d00145958760b320b15a7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:53 -0700 Subject: mm/debug_vm_pgtable/locks: take correct page table lock Make sure we call pte accessors with correct lock held. 
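In sketch form, the pairing this patch establishes looks like this (an illustrative summary of the diff below, one lock per page table level):

	/* pte level: the pte lock of the containing pmd */
	ptl = pte_lockptr(mm, pmdp);
	spin_lock(ptl);
	/* ... pte_clear_tests(), pte_advanced_tests() ... */
	pte_unmap_unlock(ptep, ptl);

	/* pmd level: its own (possibly split) lock */
	ptl = pmd_lock(mm, pmdp);
	/* ... pmd tests ... */
	spin_unlock(ptl);

	/* pud level */
	ptl = pud_lock(mm, pudp);
	/* ... pud tests ... */
	spin_unlock(ptl);

	/* p4d/pgd level: the mm-wide lock */
	spin_lock(&mm->page_table_lock);
	/* ... p4d/pgd tests ... */
	spin_unlock(&mm->page_table_lock);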
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Cc: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-10-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 3ed3fb68922c..d9f00ee07c62 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1033,30 +1033,39 @@ static int __init debug_vm_pgtable(void) hugetlb_basic_tests(pte_aligned, prot); - pte_clear_tests(mm, ptep, vaddr); - pmd_clear_tests(mm, pmdp); - pud_clear_tests(mm, pudp); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); + /* + * Page table modifying tests. They need to hold + * proper page table lock. + */ ptl = pte_lockptr(mm, pmdp); spin_lock(ptl); - + pte_clear_tests(mm, ptep, vaddr); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - + pte_unmap_unlock(ptep, ptl); + ptl = pmd_lock(mm, pmdp); + pmd_clear_tests(mm, pmdp); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); pmd_huge_tests(pmdp, pmd_aligned, prot); + pmd_populate_tests(mm, pmdp, saved_ptep); + spin_unlock(ptl); + + ptl = pud_lock(mm, pudp); + pud_clear_tests(mm, pudp); + pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); pud_huge_tests(pudp, pud_aligned, prot); + pud_populate_tests(mm, pudp, saved_pmdp); + spin_unlock(ptl); - pte_unmap_unlock(ptep, ptl); + hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_populate_tests(mm, pmdp, saved_ptep); - pud_populate_tests(mm, pudp, saved_pmdp); + spin_lock(&mm->page_table_lock); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); p4d_populate_tests(mm, p4dp, saved_pudp); pgd_populate_tests(mm, pgdp, saved_p4dp); + spin_unlock(&mm->page_table_lock); p4d_free(mm, saved_p4dp); pud_free(mm, saved_pudp); -- cgit v1.2.3-59-g8ed1b From 87f34986de0b3799efb1a5eaaa9baa790047f839 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:56 -0700 Subject: mm/debug_vm_pgtable/thp: use page table depost/withdraw with THP Architectures like ppc64 use deposited page table while updating the huge pte entries. 
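A minimal sketch of the protocol the test must now follow (illustrative; these are the standard THP deposit/withdraw helpers):

	/* Deposit a page table before exercising huge pmd updates ... */
	pgtable_trans_huge_deposit(mm, pmdp, pgtable);

	set_pmd_at(mm, vaddr, pmdp, pmd);
	/* ... pmdp_set_wrprotect(), pmdp_huge_get_and_clear(), ... */

	/* ... and withdraw it again before the table is torn down. */
	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);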
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-11-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index d9f00ee07c62..cbf6e0019f62 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -154,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { pmd_t pmd = pfn_pmd(pfn, prot); @@ -165,6 +165,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PMD_SIZE */ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pmd = pfn_pmd(pfn, prot); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_set_wrprotect(mm, vaddr, pmdp); @@ -193,6 +195,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_test_and_clear_young(vma, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + + pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -369,7 +373,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { } static void __init pud_advanced_tests(struct mm_struct *mm, @@ -1046,7 +1050,7 @@ static int __init debug_vm_pgtable(void) ptl = pmd_lock(mm, pmdp); pmd_clear_tests(mm, pmdp); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); pmd_huge_tests(pmdp, pmd_aligned, prot); pmd_populate_tests(mm, pmdp, saved_ptep); spin_unlock(ptl); -- cgit v1.2.3-59-g8ed1b From 13af05063033238777e4f155eeffe7c06be2a96c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:04:59 -0700 Subject: mm/debug_vm_pgtable/pmd_clear: don't use pmd/pud_clear on pte entries pmd_clear() should not be used to clear pmd level pte entries. 
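The distinction, sketched (illustrative only): pmd_clear() is meant for a pmd that points to a lower-level page table, while a huge (leaf) pmd must be torn down through the pte-style helpers so that architectures such as ppc64 can keep the hash page table in sync:

	/* Non-leaf pmd, pointing to a pte page: pmd_clear() is fine. */
	pmd_clear(pmdp);

	/* Leaf pmd, i.e. a huge pte entry: use the pte accessors. */
	pmdp_huge_get_and_clear(mm, vaddr, pmdp);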
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Cc: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-12-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index cbf6e0019f62..2cc299308d10 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -196,6 +196,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + /* Clear the pte entries */ + pmdp_huge_get_and_clear(mm, vaddr, pmdp); pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } @@ -317,6 +319,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pudp_test_and_clear_young(vma, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(pud_young(pud)); + + pudp_huge_get_and_clear(mm, vaddr, pudp); } static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -440,8 +444,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pmd_clear(pmdp); - pud_clear(pudp); pud_populate(mm, pudp, pmdp); pud = READ_ONCE(*pudp); WARN_ON(pud_bad(pud)); @@ -573,7 +575,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. * Hence this must not qualify as pmd_bad(). */ - pmd_clear(pmdp); pmd_populate(mm, pmdp, pgtable); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_bad(pmd)); -- cgit v1.2.3-59-g8ed1b From 2b1dd67a78c36a30f993d0d5366177df0cd7a5db Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:05:03 -0700 Subject: mm/debug_vm_pgtable/hugetlb: disable hugetlb test on ppc64 The test seems to be missing quite a lot of details w.r.t allocating the correct pgtable_t page (huge_pte_alloc()), holding the right lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA. ppc64 does have runtime checks within CONFIG_DEBUG_VM for most of these. Hence disable the test on ppc64.
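Roughly, a correct setup would need all of the following (a hedged sketch of the missing pieces, assuming a PMD-sized huge page; not a working replacement for the dropped test):

	struct hstate *h = size_to_hstate(PMD_SIZE);
	pte_t *ptep;
	spinlock_t *ptl;

	/* 1. a page table allocated by the hugetlb rules: */
	ptep = huge_pte_alloc(mm, vaddr, huge_page_size(h));
	/* 2. the hugetlb page table lock: */
	ptl = huge_pte_lock(h, mm, ptep);
	/* 3. ... plus a VMA for which is_vm_hugetlb_page() is true. */
	spin_unlock(ptl);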
[anshuman.khandual@arm.com: drop hugetlb_advanced_tests()] Link: https://lore.kernel.org/lkml/289c3fdb-1394-c1af-bdc4-5542907089dc@linux.ibm.com/#t Link: https://lkml.kernel.org/r/1600914446-21890-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Cc: Christophe Leroy Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200902114222.181353-13-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2cc299308d10..bfc9c6544adf 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -808,57 +808,8 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pte_huge(pte_mkhuge(pte))); #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } - -static void __init hugetlb_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, - pte_t *ptep, unsigned long pfn, - unsigned long vaddr, pgprot_t prot) -{ - struct page *page = pfn_to_page(pfn); - pte_t pte = ptep_get(ptep); - unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK; - - pr_debug("Validating HugeTLB advanced\n"); - pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot)); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - WARN_ON(!pte_same(pte, huge_ptep_get(ptep))); - huge_pte_clear(mm, vaddr, ptep, PMD_SIZE); - pte = huge_ptep_get(ptep); - WARN_ON(!huge_pte_none(pte)); - - pte = mk_huge_pte(page, prot); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - huge_ptep_set_wrprotect(mm, vaddr, ptep); - pte = huge_ptep_get(ptep); - WARN_ON(huge_pte_write(pte)); - - pte = mk_huge_pte(page, prot); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - huge_ptep_get_and_clear(mm, vaddr, ptep); - pte = huge_ptep_get(ptep); - WARN_ON(!huge_pte_none(pte)); - - pte = mk_huge_pte(page, prot); - pte = huge_pte_wrprotect(pte); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - pte = huge_pte_mkwrite(pte); - pte = huge_pte_mkdirty(pte); - huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1); - pte = huge_ptep_get(ptep); - WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte))); -} #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } -static void __init hugetlb_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, - pte_t *ptep, unsigned long pfn, - unsigned long vaddr, pgprot_t prot) -{ -} #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1063,8 +1014,6 @@ static int __init debug_vm_pgtable(void) pud_populate_tests(mm, pudp, saved_pmdp); spin_unlock(ptl); - hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - spin_lock(&mm->page_table_lock); p4d_clear_tests(mm, p4dp); pgd_clear_tests(mm, pgdp); -- cgit v1.2.3-59-g8ed1b From 401035d5c456691f79fbcc08eb6ee5479c30b606 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:05:06 -0700 Subject: mm/debug_vm_pgtable: avoid none pte in pte_clear_test pte_clear_tests operate on an existing pte entry. Make sure that is not a none pte entry. 
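The change, in sketch form (illustrative; both fragments correspond to the diff below):

	/* Before: may decorate a pte_none() slot with random bits. */
	pte = __pte(pte_val(ptep_get(ptep)) | RANDOM_ORVALUE);

	/* After: construct a known-valid entry from pfn and prot first. */
	pte = pfn_pte(pfn, prot);
	set_pte_at(mm, vaddr, ptep, pte);
	pte_clear(mm, vaddr, ptep);
	WARN_ON(!pte_none(ptep_get(ptep)));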
[aneesh.kumar@linux.ibm.com: avoid kernel crash with riscv] Link: https://lkml.kernel.org/r/20201015033206.140550-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Guenter Roeck Cc: Paul Walmsley Cc: Albert Ou Cc: Palmer Dabbelt Link: https://lkml.kernel.org/r/20200902114222.181353-14-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bfc9c6544adf..9a0a42a4f08d 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -540,12 +540,15 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, #endif /* PAGETABLE_P4D_FOLDED */ static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, - unsigned long vaddr) + unsigned long pfn, unsigned long vaddr, + pgprot_t prot) { - pte_t pte = ptep_get(ptep); + pte_t pte = pfn_pte(pfn, prot); pr_debug("Validating PTE clear\n"); +#ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); +#endif set_pte_at(mm, vaddr, ptep, pte); barrier(); pte_clear(mm, vaddr, ptep); @@ -996,7 +999,7 @@ static int __init debug_vm_pgtable(void) ptl = pte_lockptr(mm, pmdp); spin_lock(ptl); - pte_clear_tests(mm, ptep, vaddr); + pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); pte_unmap_unlock(ptep, ptl); -- cgit v1.2.3-59-g8ed1b From f14312e1ed1e40dfed38bb70eb9a71d97a6b60f4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 Oct 2020 20:05:10 -0700 Subject: mm/debug_vm_pgtable: avoid doing memory allocation with pgtable_t mapped. With highmem, pte_alloc_map() keeps the level4 page table mapped using kmap_atomic(). Avoid doing new memory allocation with the page table mapped like above. [ 9.409233] BUG: sleeping function called from invalid context at mm/page_alloc.c:4822 [ 9.410557] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper [ 9.411932] no locks held by swapper/1. [ 9.412595] CPU: 0 PID: 1 Comm: swapper Not tainted 5.9.0-rc3-00323-gc50eb1ed654b5 #2 [ 9.413824] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 9.415207] Call Trace: [ 9.415651] ? ___might_sleep.cold+0xa7/0xcc [ 9.416367] ? __alloc_pages_nodemask+0x14c/0x5b0 [ 9.417055] ? swap_migration_tests+0x50/0x293 [ 9.417704] ? debug_vm_pgtable+0x4bc/0x708 [ 9.418287] ? swap_migration_tests+0x293/0x293 [ 9.418911] ? do_one_initcall+0x82/0x3cb [ 9.419465] ? parse_args+0x1bd/0x280 [ 9.419983] ? rcu_read_lock_sched_held+0x36/0x60 [ 9.420673] ? trace_initcall_level+0x1f/0xf3 [ 9.421279] ? trace_initcall_level+0xbd/0xf3 [ 9.421881] ? do_basic_setup+0x9d/0xdd [ 9.422410] ? do_basic_setup+0xc3/0xdd [ 9.422938] ? kernel_init_freeable+0x72/0xa3 [ 9.423539] ? rest_init+0x134/0x134 [ 9.424055] ? kernel_init+0x5/0x12c [ 9.424574] ?
ret_from_fork+0x19/0x30 Reported-by: kernel test robot Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200913110327.645310-1-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 9a0a42a4f08d..c05d9dcf7891 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -943,7 +943,13 @@ static int __init debug_vm_pgtable(void) p4dp = p4d_alloc(mm, pgdp, vaddr); pudp = pud_alloc(mm, p4dp, vaddr); pmdp = pmd_alloc(mm, pudp, vaddr); - ptep = pte_alloc_map(mm, pmdp, vaddr); + /* + * Allocate pgtable_t + */ + if (pte_alloc(mm, pmdp)) { + pr_err("pgtable allocation failed\n"); + return 1; + } /* * Save all the page table page addresses as the page table @@ -997,8 +1003,7 @@ static int __init debug_vm_pgtable(void) * proper page table lock. */ - ptl = pte_lockptr(mm, pmdp); - spin_lock(ptl); + ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl); pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); pte_unmap_unlock(ptep, ptl); -- cgit v1.2.3-59-g8ed1b From 57417cebc96b57122a2207fc84a6077d20c84b4b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:13 -0700 Subject: XArray: add xa_get_order Patch series "Fix read-only THP for non-tmpfs filesystems". As described more verbosely in the [3/3] changelog, we can inadvertently put an order-0 page in the page cache which occupies 512 consecutive entries. Users are running into this if they enable the READ_ONLY_THP_FOR_FS config option; see https://bugzilla.kernel.org/show_bug.cgi?id=206569 and Qian Cai has also reported it here: https://lore.kernel.org/lkml/20200616013309.GB815@lca.pw/ This is a rather intrusive way of fixing the problem, but has the advantage that I've actually been testing it with the THP patches, which means that it sees far more use than it does upstream -- indeed, Song has been entirely unable to reproduce it. It also has the advantage that it removes a few patches from my gargantuan backlog of THP patches. This patch (of 3): This function returns the order of the entry at the index. We need this because there isn't space in the shadow entry to encode its order. [akpm@linux-foundation.org: export xa_get_order to modules] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: "Kirill A . Shutemov" Cc: Qian Cai Cc: Song Liu Link: https://lkml.kernel.org/r/20200903183029.14930-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200903183029.14930-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 9 +++++++++ lib/test_xarray.c | 21 +++++++++++++++++++++ lib/xarray.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index b4d70e7568b2..5b7f4ebcf4ff 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1505,6 +1505,15 @@ void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); +#ifdef CONFIG_XARRAY_MULTI +int xa_get_order(struct xarray *, unsigned long index); +#else +static inline int xa_get_order(struct xarray *xa, unsigned long index) +{ + return 0; +} +#endif + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. 
diff --git a/lib/test_xarray.c b/lib/test_xarray.c index d4f97925dbd8..bdd4d7995f79 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1649,6 +1649,26 @@ static noinline void check_account(struct xarray *xa) #endif } +static noinline void check_get_order(struct xarray *xa) +{ + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned int order; + unsigned long i, j; + + for (i = 0; i < 3; i++) + XA_BUG_ON(xa, xa_get_order(xa, i) != 0); + + for (order = 0; order < max_order; order++) { + for (i = 0; i < 10; i++) { + xa_store_order(xa, i << order, order, + xa_mk_index(i << order), GFP_KERNEL); + for (j = i << order; j < (i + 1) << order; j++) + XA_BUG_ON(xa, xa_get_order(xa, j) != order); + xa_erase(xa, i << order); + } + } +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -1697,6 +1717,7 @@ static int xarray_checks(void) check_reserve(&array); check_reserve(&xa0); check_multi_store(&array); + check_get_order(&array); check_xa_alloc(); check_find(&array); check_find_entry(&array); diff --git a/lib/xarray.c b/lib/xarray.c index e9e641d3c0c3..f24cb9a43af8 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1592,6 +1592,46 @@ unlock: return xas_result(&xas, NULL); } EXPORT_SYMBOL(xa_store_range); + +/** + * xa_get_order() - Get the order of an entry. + * @xa: XArray. + * @index: Index of the entry. + * + * Return: A number between 0 and 63 indicating the order of the entry. + */ +int xa_get_order(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + int order = 0; + + rcu_read_lock(); + entry = xas_load(&xas); + + if (!entry) + goto unlock; + + if (!xas.xa_node) + goto unlock; + + for (;;) { + unsigned int slot = xas.xa_offset + (1 << order); + + if (slot >= XA_CHUNK_SIZE) + break; + if (!xa_is_sibling(xas.xa_node->slots[slot])) + break; + order++; + } + + order += xas.xa_node->shift; +unlock: + rcu_read_unlock(); + + return order; +} +EXPORT_SYMBOL(xa_get_order); #endif /* CONFIG_XARRAY_MULTI */ /** -- cgit v1.2.3-59-g8ed1b From 8fc75643c5e14574c8be59b69182452ece28315a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:16 -0700 Subject: XArray: add xas_split In order to use multi-index entries for huge pages in the page cache, we need to be able to split a multi-index entry (eg if a file is truncated in the middle of a huge page entry). This version does not support splitting more than one level of the tree at a time. This is an acceptable limitation for the page cache as we do not expect to support order-12 pages in the near future. [akpm@linux-foundation.org: export xas_split_alloc() to modules] [willy@infradead.org: fix xarray split] Link: https://lkml.kernel.org/r/20200910175450.GV6583@casper.infradead.org [willy@infradead.org: fix xarray] Link: https://lkml.kernel.org/r/20201001233943.GW20115@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: "Kirill A . 
Shutemov" Cc: Qian Cai Cc: Song Liu Link: https://lkml.kernel.org/r/20200903183029.14930-3-willy@infradead.org Signed-off-by: Linus Torvalds --- Documentation/core-api/xarray.rst | 16 ++-- include/linux/xarray.h | 13 +++ lib/test_xarray.c | 44 ++++++++++ lib/xarray.c | 168 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 225 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 640934b6f7b4..a137a0e6d068 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -475,13 +475,15 @@ or iterations will move the index to the first index in the range. Each entry will only be returned once, no matter how many indices it occupies. -Using xas_next() or xas_prev() with a multi-index xa_state -is not supported. Using either of these functions on a multi-index entry -will reveal sibling entries; these should be skipped over by the caller. - -Storing ``NULL`` into any index of a multi-index entry will set the entry -at every index to ``NULL`` and dissolve the tie. Splitting a multi-index -entry into entries occupying smaller ranges is not yet supported. +Using xas_next() or xas_prev() with a multi-index xa_state is not +supported. Using either of these functions on a multi-index entry will +reveal sibling entries; these should be skipped over by the caller. + +Storing ``NULL`` into any index of a multi-index entry will set the +entry at every index to ``NULL`` and dissolve the tie. A multi-index +entry can be split into entries occupying smaller ranges by calling +xas_split_alloc() without the xa_lock held, followed by taking the lock +and calling xas_split(). Functions and structures ======================== diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 5b7f4ebcf4ff..5cdf441f6377 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1507,11 +1507,24 @@ void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); +void xas_split(struct xa_state *, void *entry, unsigned int order); +void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } + +static inline void xas_split(struct xa_state *xas, void *entry, + unsigned int order) +{ + xas_store(xas, entry); +} + +static inline void xas_split_alloc(struct xa_state *xas, void *entry, + unsigned int order, gfp_t gfp) +{ +} #endif /** diff --git a/lib/test_xarray.c b/lib/test_xarray.c index bdd4d7995f79..8262c3f05a5d 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1503,6 +1503,49 @@ static noinline void check_store_range(struct xarray *xa) } } +#ifdef CONFIG_XARRAY_MULTI +static void check_split_1(struct xarray *xa, unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + void *entry; + unsigned int i = 0; + + xa_store_order(xa, index, order, xa, GFP_KERNEL); + + xas_split_alloc(&xas, xa, order, GFP_KERNEL); + xas_lock(&xas); + xas_split(&xas, xa, order); + xas_unlock(&xas); + + xa_for_each(xa, index, entry) { + XA_BUG_ON(xa, entry != xa); + i++; + } + XA_BUG_ON(xa, i != 1 << order); + + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + + xa_destroy(xa); +} + +static noinline void check_split(struct xarray *xa) +{ + unsigned int order; + + XA_BUG_ON(xa, !xa_empty(xa)); + + for (order = 1; order < 2 * XA_CHUNK_SHIFT; order++) { + check_split_1(xa, 0, order); + check_split_1(xa, 
1UL << order, order); + check_split_1(xa, 3UL << order, order); + } +} +#else +static void check_split(struct xarray *xa) { } +#endif + static void check_align_1(struct xarray *xa, char *name) { int i; @@ -1729,6 +1772,7 @@ static int xarray_checks(void) check_store_range(&array); check_store_iter(&array); check_align(&xa0); + check_split(&array); check_workingset(&array, 0); check_workingset(&array, 64); diff --git a/lib/xarray.c b/lib/xarray.c index f24cb9a43af8..b76eea7b314c 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -266,13 +266,14 @@ static void xa_node_free(struct xa_node *node) */ static void xas_destroy(struct xa_state *xas) { - struct xa_node *node = xas->xa_alloc; + struct xa_node *next, *node = xas->xa_alloc; - if (!node) - return; - XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); - kmem_cache_free(radix_tree_node_cachep, node); - xas->xa_alloc = NULL; + while (node) { + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + next = rcu_dereference_raw(node->parent); + radix_tree_node_rcu_free(&node->rcu_head); + xas->xa_alloc = node = next; + } } /** @@ -304,6 +305,7 @@ bool xas_nomem(struct xa_state *xas, gfp_t gfp) xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); if (!xas->xa_alloc) return false; + xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; @@ -339,6 +341,7 @@ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) } if (!xas->xa_alloc) return false; + xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; @@ -403,7 +406,7 @@ static unsigned long xas_size(const struct xa_state *xas) /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a - * multiple-index entry at index 0, the calculation is a little more complex + * multi-index entry at index 0, the calculation is a little more complex * than you might expect. */ static unsigned long xas_max(struct xa_state *xas) @@ -946,6 +949,153 @@ void xas_init_marks(const struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_init_marks); +#ifdef CONFIG_XARRAY_MULTI +static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) +{ + unsigned int marks = 0; + xa_mark_t mark = XA_MARK_0; + + for (;;) { + if (node_get_mark(node, offset, mark)) + marks |= 1 << (__force unsigned int)mark; + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + return marks; +} + +static void node_set_marks(struct xa_node *node, unsigned int offset, + struct xa_node *child, unsigned int marks) +{ + xa_mark_t mark = XA_MARK_0; + + for (;;) { + if (marks & (1 << (__force unsigned int)mark)) { + node_set_mark(node, offset, mark); + if (child) + node_mark_all(child, mark); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} + +/** + * xas_split_alloc() - Allocate memory for splitting an entry. + * @xas: XArray operation state. + * @entry: New entry which will be stored in the array. + * @order: New entry order. + * @gfp: Memory allocation flags. + * + * This function should be called before calling xas_split(). + * If necessary, it will allocate new nodes (and fill them with @entry) + * to prepare for the upcoming split of an entry of @order size into + * entries of the order stored in the @xas. + * + * Context: May sleep if @gfp flags permit. 
+ */ +void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, + gfp_t gfp) +{ + unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + unsigned int mask = xas->xa_sibs; + + /* XXX: no support for splitting really large entries yet */ + if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order)) + goto nomem; + if (xas->xa_shift + XA_CHUNK_SHIFT > order) + return; + + do { + unsigned int i; + void *sibling; + struct xa_node *node; + + node = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!node) + goto nomem; + node->array = xas->xa; + for (i = 0; i < XA_CHUNK_SIZE; i++) { + if ((i & mask) == 0) { + RCU_INIT_POINTER(node->slots[i], entry); + sibling = xa_mk_sibling(0); + } else { + RCU_INIT_POINTER(node->slots[i], sibling); + } + } + RCU_INIT_POINTER(node->parent, xas->xa_alloc); + xas->xa_alloc = node; + } while (sibs-- > 0); + + return; +nomem: + xas_destroy(xas); + xas_set_err(xas, -ENOMEM); +} +EXPORT_SYMBOL_GPL(xas_split_alloc); + +/** + * xas_split() - Split a multi-index entry into smaller entries. + * @xas: XArray operation state. + * @entry: New entry to store in the array. + * @order: New entry order. + * + * The value in the entry is copied to all the replacement entries. + * + * Context: Any context. The caller should hold the xa_lock. + */ +void xas_split(struct xa_state *xas, void *entry, unsigned int order) +{ + unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + unsigned int offset, marks; + struct xa_node *node; + void *curr = xas_load(xas); + int values = 0; + + node = xas->xa_node; + if (xas_top(node)) + return; + + marks = node_get_marks(node, xas->xa_offset); + + offset = xas->xa_offset + sibs; + do { + if (xas->xa_shift < node->shift) { + struct xa_node *child = xas->xa_alloc; + + xas->xa_alloc = rcu_dereference_raw(child->parent); + child->shift = node->shift - XA_CHUNK_SHIFT; + child->offset = offset; + child->count = XA_CHUNK_SIZE; + child->nr_values = xa_is_value(entry) ? + XA_CHUNK_SIZE : 0; + RCU_INIT_POINTER(child->parent, node); + node_set_marks(node, offset, child, marks); + rcu_assign_pointer(node->slots[offset], + xa_mk_node(child)); + if (xa_is_value(curr)) + values--; + } else { + unsigned int canon = offset - xas->xa_sibs; + + node_set_marks(node, canon, NULL, marks); + rcu_assign_pointer(node->slots[canon], entry); + while (offset > canon) + rcu_assign_pointer(node->slots[offset--], + xa_mk_sibling(canon)); + values += (xa_is_value(entry) - xa_is_value(curr)) * + (xas->xa_sibs + 1); + } + } while (offset-- > xas->xa_offset); + + node->nr_values += values; +} +EXPORT_SYMBOL_GPL(xas_split); +#endif + /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. @@ -1407,7 +1557,7 @@ EXPORT_SYMBOL(__xa_store); * @gfp: Memory allocation flags. * * After this function returns, loads from this index will return @entry. - * Storing into an existing multislot entry updates the entry of every index. + * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. @@ -1549,7 +1699,7 @@ static void xas_set_range(struct xa_state *xas, unsigned long first, * * After this function returns, loads from any index between @first and @last, * inclusive will return @entry. - * Storing into an existing multislot entry updates the entry of every index. + * Storing into an existing multi-index entry updates the entry of every index. 
* The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Process context. Takes and releases the xa_lock. May sleep -- cgit v1.2.3-59-g8ed1b From 198b62f83eef1d605d70eca32759c92cdcc14175 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:20 -0700 Subject: mm/filemap: fix storing to a THP shadow entry When a THP is removed from the page cache by reclaim, we replace it with a shadow entry that occupies all slots of the XArray previously occupied by the THP. If the user then accesses that page again, we only allocate a single page, but storing it into the shadow entry replaces all entries with that one page. That leads to bugs like page dumped because: VM_BUG_ON_PAGE(page_to_pgoff(page) != offset) ------------[ cut here ]------------ kernel BUG at mm/filemap.c:2529! https://bugzilla.kernel.org/show_bug.cgi?id=206569 This is hard to reproduce with mainline, but happens regularly with the THP patchset (as so many more THPs are created). This solution is taken from the THP patchset. It splits the shadow entry into order-0 pieces at the time that we bring a new page into cache. Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS") Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Song Liu Cc: "Kirill A . Shutemov" Cc: Qian Cai Link: https://lkml.kernel.org/r/20200903183029.14930-4-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 38546dca58fe..3b76d79fd0cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -829,13 +829,12 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); static int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask, + pgoff_t offset, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); int error; - void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); @@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page, page->index = offset; if (!huge) { - error = mem_cgroup_charge(page, current->mm, gfp_mask); + error = mem_cgroup_charge(page, current->mm, gfp); if (error) goto error; } + gfp &= GFP_RECLAIM_MASK; + do { + unsigned int order = xa_get_order(xas.xa, xas.xa_index); + void *entry, *old = NULL; + + if (order > thp_order(page)) + xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), + order, gfp); xas_lock_irq(&xas); - old = xas_load(&xas); - if (old && !xa_is_value(old)) - xas_set_err(&xas, -EEXIST); + xas_for_each_conflict(&xas, entry) { + old = entry; + if (!xa_is_value(entry)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + } + + if (old) { + if (shadowp) + *shadowp = old; + /* entry may have been split before we acquired lock */ + order = xa_get_order(xas.xa, xas.xa_index); + if (order > thp_order(page)) { + xas_split(&xas, old, order); + xas_reset(&xas); + } + } + xas_store(&xas, page); if (xas_error(&xas)) goto unlock; - if (xa_is_value(old)) { + if (old) mapping->nrexceptional--; - if (shadowp) - *shadowp = old; - } mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting */ @@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page, __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + } while (xas_nomem(&xas, gfp)); if
(xas_error(&xas)) { error = xas_error(&xas); -- cgit v1.2.3-59-g8ed1b From 887b22c628c615811be3b877e02c9c1fe9d85d6d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:23 -0700 Subject: mm/filemap: fix page cache removal for arbitrary sized THPs Patch series "Remove assumptions of THP size". There are a number of places in the VM which assume that a THP is a PMD in size. That's true today, and remains true after this patch series, but this is a prerequisite for switching to arbitrary-sized THPs. thp_nr_pages() still returns either HPAGE_PMD_NR or 1, but will be changed later. This patch (of 11): page_cache_free_page() assumes THPs are PMD_SIZE; fix that assumption. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Acked-by: Kirill A. Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200908195539.25896-2-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3b76d79fd0cf..2937b57329be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping, freepage(page); if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); + page_ref_sub(page, thp_nr_pages(page)); VM_BUG_ON_PAGE(page_count(page) <= 0, page); } else { put_page(page); -- cgit v1.2.3-59-g8ed1b From d01ac3c35214ce362f50cada37cb7bab8c801896 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:26 -0700 Subject: mm/memory: remove page fault assumption of compound page size A compound page in the page cache will not necessarily be of PMD size, so check explicitly. [willy@infradead.org: fix remove page fault assumption of compound page size] Link: https://lkml.kernel.org/r/20201001152259.14932-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Huang Ying Cc: Kirill A. Shutemov Link: https://lkml.kernel.org/r/20200908195539.25896-3-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2afb01ea1307..589afe45d0b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3709,13 +3709,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i; - vm_fault_t ret; + vm_fault_t ret = VM_FAULT_FALLBACK; if (!transhuge_vma_suitable(vma, haddr)) - return VM_FAULT_FALLBACK; + return ret; - ret = VM_FAULT_FALLBACK; page = compound_head(page); + if (compound_order(page) != HPAGE_PMD_ORDER) + return ret; /* * Archs like ppc64 need additonal space to store information -- cgit v1.2.3-59-g8ed1b From 8fb156c9ee2db94f7127c930c89917634a1a9f56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:29 -0700 Subject: mm/page_owner: change split_page_owner to take a count The implementation of split_page_owner() prefers a count rather than the old order of the page. When we support a variable size THP, we won't have the order at this point, but we will have the number of pages. So change the interface to what the caller and callee would prefer. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. 
Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-4-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 6 +++--- mm/huge_memory.c | 2 +- mm/page_alloc.c | 2 +- mm/page_owner.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 8679ccd722e8..3468794f83d2 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int order); +extern void __split_page_owner(struct page *page, unsigned int nr); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int order) +static inline void split_page_owner(struct page *page, unsigned int nr) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, nr); } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 65c289c13b58..aa612cc5b3fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2454,7 +2454,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); - split_page_owner(head, HPAGE_PMD_ORDER); + split_page_owner(head, HPAGE_PMD_NR); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05fe1ddb033c..febb2d731f9e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3209,7 +3209,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); diff --git a/mm/page_owner.c b/mm/page_owner.c index 360461509423..4ca3051a1035 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -204,7 +204,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner->last_migrate_reason = reason; } -void __split_page_owner(struct page *page, unsigned int order) +void __split_page_owner(struct page *page, unsigned int nr) { int i; struct page_ext *page_ext = lookup_page_ext(page); @@ -213,7 +213,7 @@ void __split_page_owner(struct page *page, unsigned int order) if (unlikely(!page_ext)) return; - for (i = 0; i < (1 << order); i++) { + for (i = 0; i < nr; i++) { page_owner = get_page_owner(page_ext); page_owner->order = 0; page_ext = page_ext_next(page_ext); -- cgit v1.2.3-59-g8ed1b From 86b562b629728a072e6928bdc02a105d358a007d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 15 Oct 2020 20:05:33 -0700 Subject: mm/huge_memory: fix total_mapcount assumption of page size File THPs may now be of arbitrary order. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-5-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aa612cc5b3fe..b63c98516a7c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2494,7 +2494,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, int total_mapcount(struct page *page) { - int i, compound, ret; + int i, compound, nr, ret; VM_BUG_ON_PAGE(PageTail(page), page); @@ -2502,16 +2502,17 @@ int total_mapcount(struct page *page) return atomic_read(&page->_mapcount) + 1; compound = compound_mapcount(page); + nr = compound_nr(page); if (PageHuge(page)) return compound; ret = compound; - for (i = 0; i < HPAGE_PMD_NR; i++) + for (i = 0; i < nr; i++) ret += atomic_read(&page[i]._mapcount) + 1; /* File pages has compound_mapcount included in _mapcount */ if (!PageAnon(page)) - return ret - compound * HPAGE_PMD_NR; + return ret - compound * nr; if (PageDoubleMap(page)) - ret -= HPAGE_PMD_NR; + ret -= nr; return ret; } -- cgit v1.2.3-59-g8ed1b From 8cce54756806e5777069c46011c5f54f9feac717 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 15 Oct 2020 20:05:36 -0700 Subject: mm/huge_memory: fix split assumption of page size File THPs may now be of arbitrary size, and we can't rely on that size after doing the split so remember the number of pages before we start the split. Signed-off-by: Kirill A. Shutemov Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-6-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b63c98516a7c..5934ef722e28 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2335,13 +2335,13 @@ static void unmap_page(struct page *page) VM_BUG_ON_PAGE(!unmap_success, page); } -static void remap_page(struct page *page) +static void remap_page(struct page *page, unsigned int nr) { int i; if (PageTransHuge(page)) { remove_migration_ptes(page, page, true); } else { - for (i = 0; i < HPAGE_PMD_NR; i++) + for (i = 0; i < nr; i++) remove_migration_ptes(page + i, page + i, true); } } @@ -2419,6 +2419,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; + unsigned int nr = thp_nr_pages(head); int i; lruvec = mem_cgroup_page_lruvec(head, pgdat); @@ -2434,7 +2435,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { + for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ if (head[i].index >= end) { @@ -2454,7 +2455,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); - split_page_owner(head, HPAGE_PMD_NR); + split_page_owner(head, nr); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { @@ -2473,9 +2474,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, spin_unlock_irqrestore(&pgdat->lru_lock, flags); - remap_page(head); + remap_page(head, nr); - for 
(i = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0; i < nr; i++) { struct page *subpage = head + i; if (subpage == page) continue; @@ -2729,7 +2730,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(&pgdata->lru_lock, flags); - remap_page(head); + remap_page(head, thp_nr_pages(head)); ret = -EBUSY; } -- cgit v1.2.3-59-g8ed1b From 65dfe3c3bc411caa04c4f6c7d81418b90d6a2ce8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:39 -0700 Subject: mm/huge_memory: fix page_trans_huge_mapcount assumption of THP size Ask the page what size it is instead of assuming it's PMD size. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-7-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5934ef722e28..6fb75df0e8c0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2558,14 +2558,14 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) page = compound_head(page); _total_mapcount = ret = 0; - for (i = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0; i < thp_nr_pages(page); i++) { mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); _total_mapcount += mapcount; } if (PageDoubleMap(page)) { ret -= 1; - _total_mapcount -= HPAGE_PMD_NR; + _total_mapcount -= thp_nr_pages(page); } mapcount = compound_mapcount(page); ret += mapcount; -- cgit v1.2.3-59-g8ed1b From e2333dad2d4a3543f07beb5da61401046e54f2ff Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:43 -0700 Subject: mm/huge_memory: fix can_split_huge_page assumption of THP size Ask the page how many subpages it has instead of assuming it's PMD size. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. Shutemov Acked-by: "Huang, Ying" Link: https://lkml.kernel.org/r/20200908195539.25896-8-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6fb75df0e8c0..4474cc3b1a48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2582,9 +2582,9 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) /* Additional pins from page cache */ if (PageAnon(page)) - extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0; else - extra_pins = HPAGE_PMD_NR; + extra_pins = thp_nr_pages(page); if (pextra_pins) *pextra_pins = extra_pins; return total_mapcount(page) == page_count(page) - extra_pins - 1; -- cgit v1.2.3-59-g8ed1b From 5eaf35ab1275c4048d5c64b7bd9e982af9ea6b9f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:46 -0700 Subject: mm/rmap: fix assumptions of THP size Ask the page what size it is instead of assuming it's PMD size. Do this for anon pages as well as file pages for when someone decides to support that. Leave the assumption alone for pages which are PMD mapped; we don't currently grow THPs beyond PMD size, so we don't need to change this code yet. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. 
Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-9-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/rmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 9425260774a1..1b84945d655c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1205,7 +1205,7 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } @@ -1246,7 +1246,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone else? */ if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } @@ -1293,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page) * Subpages can be mapped with PTEs too. Check how many of * them are still mapped. */ - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } @@ -1303,10 +1303,10 @@ static void page_remove_anon_compound_rmap(struct page *page) * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < HPAGE_PMD_NR) + if (nr && nr < thp_nr_pages(page)) deferred_split_huge_page(page); } else { - nr = HPAGE_PMD_NR; + nr = thp_nr_pages(page); } if (unlikely(PageMlocked(page))) -- cgit v1.2.3-59-g8ed1b From fc3a5ac52827960ca1e17ba243ff4ba29fcc057a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:50 -0700 Subject: mm/truncate: fix truncation for pages of arbitrary size Remove the assumption that a compound page is HPAGE_PMD_SIZE, and the assumption that any page is PAGE_SIZE. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-10-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/truncate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 6bbe0f0b3ce9..18cec39a9f53 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -168,7 +168,7 @@ void do_invalidatepage(struct page *page, unsigned int offset, * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * - * We need to bale out if page->mapping is no longer equal to the original + * We need to bail out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. @@ -177,12 +177,12 @@ static void truncate_cleanup_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { - pgoff_t nr = PageTransHuge(page) ? 
HPAGE_PMD_NR : 1; + unsigned int nr = thp_nr_pages(page); unmap_mapping_pages(mapping, page->index, nr, false); } if (page_has_private(page)) - do_invalidatepage(page, 0, PAGE_SIZE); + do_invalidatepage(page, 0, thp_size(page)); /* * Some filesystems seem to re-dirty the page even after -- cgit v1.2.3-59-g8ed1b From 8854a6a7248f6c372f1aa2a47b58027d3ba600c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:53 -0700 Subject: mm/page-writeback: support tail pages in wait_for_stable_page page->mapping is undefined for tail pages, so operate exclusively on the head page. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-11-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 358d6f28c627..7709f0e223f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2849,6 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback); */ void wait_for_stable_page(struct page *page) { + page = thp_head(page); if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) wait_on_page_writeback(page); } -- cgit v1.2.3-59-g8ed1b From 3efe62e466958a1e59e68e0faa5c66b5b99b17ce Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:56 -0700 Subject: mm/vmscan: allow arbitrary sized pages to be paged out Remove the assumption that a compound page has HPAGE_PMD_NR pins from the page cache. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. Shutemov Acked-by: "Huang, Ying" Link: https://lkml.kernel.org/r/20200908195539.25896-12-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 879fb57c5045..fc4dee9b91f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -725,8 +725,7 @@ static inline int is_page_cache_freeable(struct page *page) * that isolated the page, the page cache and optional buffer * heads at page->private. */ - int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? - HPAGE_PMD_NR : 1; + int page_cache_pins = thp_nr_pages(page); return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } -- cgit v1.2.3-59-g8ed1b From 01c70267053d6718820ac0902d8823d5dd2a6adb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:00 -0700 Subject: fs: add a filesystem flag for THPs The page cache needs to know whether the filesystem supports THPs so that it doesn't send THPs to filesystems which can't handle them. Dave Chinner points out that getting from the page mapping to the filesystem type is too many steps (mapping->host->i_sb->s_type->fs_flags) so cache that information in the address space flags. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Song Liu Cc: Rik van Riel Cc: "Kirill A . 
Shutemov" Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200916032717.22917-1-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/inode.c | 2 ++ include/linux/fs.h | 1 + include/linux/pagemap.h | 6 ++++++ mm/shmem.c | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 72c4c347afb7..9d78c37b00b8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + if (sb->s_type->fs_flags & FS_THP_SUPPORT) + __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/include/linux/fs.h b/include/linux/fs.h index ae97d87a00d2..72369be23f91 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2209,6 +2209,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ +#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3afd3242b54..820c970fd24a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -29,6 +29,7 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, + AS_THP_SUPPORT = 6, /* THPs supported */ }; /** @@ -120,6 +121,11 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +static inline bool mapping_thp_support(struct address_space *mapping) +{ + return test_bit(AS_THP_SUPPORT, &mapping->flags); +} + void release_pages(struct page **pages, int nr); /* diff --git a/mm/shmem.c b/mm/shmem.c index 6d4ddef4a24f..537c137698f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3984,7 +3984,7 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT, }; int __init shmem_init(void) -- cgit v1.2.3-59-g8ed1b From 6f4d2f9770cf154f9867f466d7b1b463a39f05a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:03 -0700 Subject: fs: do not update nr_thps for mappings which support THPs The nr_thps counter is to support THPs in the page cache when the filesystem doesn't understand THPs. Eventually it will be removed, but we should still support filesystems which do not understand THPs yet. Move the nr_thp manipulation functions to filemap.h since they're page-cache specific. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Song Liu Cc: Rik van Riel Cc: "Kirill A . 
Shutemov" Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200916032717.22917-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 27 --------------------------- include/linux/pagemap.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 72369be23f91..d1d166b46131 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2697,33 +2697,6 @@ static inline errseq_t file_sample_sb_err(struct file *file) return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } -static inline int filemap_nr_thps(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - return atomic_read(&mapping->nr_thps); -#else - return 0; -#endif -} - -static inline void filemap_nr_thps_inc(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_inc(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - -static inline void filemap_nr_thps_dec(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_dec(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 820c970fd24a..a0024528a9ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -126,6 +126,35 @@ static inline bool mapping_thp_support(struct address_space *mapping) return test_bit(AS_THP_SUPPORT, &mapping->flags); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + void release_pages(struct page **pages, int nr); /* -- cgit v1.2.3-59-g8ed1b From c4f9c701f9b44299e6adbc58d1a4bb2c40383494 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 15 Oct 2020 20:06:07 -0700 Subject: mm: fix a race during THP splitting It is reported that the following bug is triggered if the HDD is used as swap device, [ 5758.157556] BUG: kernel NULL pointer dereference, address: 0000000000000007 [ 5758.165331] #PF: supervisor write access in kernel mode [ 5758.171161] #PF: error_code(0x0002) - not-present page [ 5758.176894] PGD 0 P4D 0 [ 5758.179721] Oops: 0002 [#1] SMP PTI [ 5758.183614] CPU: 10 PID: 316 Comm: kswapd1 Kdump: loaded Tainted: G S --------- --- 5.9.0-0.rc3.1.tst.el8.x86_64 #1 [ 5758.196717] Hardware name: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.02.01.0002.082220131453 08/22/2013 [ 5758.208176] RIP: 0010:split_swap_cluster+0x47/0x60 [ 5758.213522] Code: c1 e3 06 48 c1 eb 0f 48 8d 1c d8 48 89 df e8 d0 20 6a 00 80 63 07 fb 48 85 db 74 16 48 89 df c6 07 00 66 66 66 90 31 c0 5b c3 <80> 24 25 07 00 00 00 fb 31 c0 5b c3 b8 f0 ff ff ff 5b c3 66 0f 1f [ 5758.234478] RSP: 0018:ffffb147442d7af0 EFLAGS: 00010246 [ 5758.240309] RAX: 0000000000000000 RBX: 000000000014b217 RCX: ffffb14779fd9000 [ 5758.248281] RDX: 000000000014b217 RSI: ffff9c52f2ab1400 RDI: 
000000000014b217 [ 5758.256246] RBP: ffffe00c51168080 R08: ffffe00c5116fe08 R09: ffff9c52fffd3000 [ 5758.264208] R10: ffffe00c511537c8 R11: ffff9c52fffd3c90 R12: 0000000000000000 [ 5758.272172] R13: ffffe00c51170000 R14: ffffe00c51170000 R15: ffffe00c51168040 [ 5758.280134] FS: 0000000000000000(0000) GS:ffff9c52f2a80000(0000) knlGS:0000000000000000 [ 5758.289163] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5758.295575] CR2: 0000000000000007 CR3: 0000000022a0e003 CR4: 00000000000606e0 [ 5758.303538] Call Trace: [ 5758.306273] split_huge_page_to_list+0x88b/0x950 [ 5758.311433] deferred_split_scan+0x1ca/0x310 [ 5758.316202] do_shrink_slab+0x12c/0x2a0 [ 5758.320491] shrink_slab+0x20f/0x2c0 [ 5758.324482] shrink_node+0x240/0x6c0 [ 5758.328469] balance_pgdat+0x2d1/0x550 [ 5758.332652] kswapd+0x201/0x3c0 [ 5758.336157] ? finish_wait+0x80/0x80 [ 5758.340147] ? balance_pgdat+0x550/0x550 [ 5758.344525] kthread+0x114/0x130 [ 5758.348126] ? kthread_park+0x80/0x80 [ 5758.352214] ret_from_fork+0x22/0x30 [ 5758.356203] Modules linked in: fuse zram rfkill sunrpc intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mgag200 iTCO_wdt crct10dif_pclmul iTCO_vendor_support drm_kms_helper crc32_pclmul ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops cec rapl joydev intel_cstate ipmi_si ipmi_devintf drm intel_uncore i2c_i801 ipmi_msghandler pcspkr lpc_ich mei_me i2c_smbus mei ioatdma ip_tables xfs libcrc32c sr_mod sd_mod cdrom t10_pi sg igb ahci libahci i2c_algo_bit crc32c_intel libata dca wmi dm_mirror dm_region_hash dm_log dm_mod [ 5758.412673] CR2: 0000000000000007 [ 0.000000] Linux version 5.9.0-0.rc3.1.tst.el8.x86_64 (mockbuild@x86-vm-15.build.eng.bos.redhat.com) (gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5), GNU ld version 2.30-79.el8) #1 SMP Wed Sep 9 16:03:34 EDT 2020 After further digging it's found that the following race condition exists in the original implementation:

CPU1                                           CPU2
----                                           ----
deferred_split_scan()
  split_huge_page(page) /* page isn't compound head */
    split_huge_page_to_list(page, NULL)
      __split_huge_page(page, )
        ClearPageCompound(head)
        /* unlock all subpages except page (not head) */
                                               add_to_swap(head) /* not THP */
                                                 get_swap_page(head)
                                                 add_to_swap_cache(head, )
                                                   SetPageSwapCache(head)
      if PageSwapCache(head)
        split_swap_cluster(/* swap entry of head */)
        /* Deref sis->cluster_info: NULL accessing! */

So, in split_huge_page_to_list(), PageSwapCache() is called for the already split and unlocked "head", which may be added to the swap cache on another CPU. As a result, split_swap_cluster() may be called wrongly. To fix the race, the call to split_swap_cluster() is moved to __split_huge_page(), before all subpages are unlocked, so that PageSwapCache() is stable. Fixes: 59807685a7e77 ("mm, THP, swap: support splitting THP for THP swap out") Reported-by: Rafael Aquini Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Tested-by: Rafael Aquini Acked-by: Kirill A.
Shutemov Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20201009073647.1531083-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4474cc3b1a48..9474dbc150ed 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2476,6 +2476,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, remap_page(head, nr); + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + split_swap_cluster(entry); + } + for (i = 0; i < nr; i++) { struct page *subpage = head + i; if (subpage == page) @@ -2711,12 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } __split_huge_page(page, list, end, flags); - if (PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; - - ret = split_swap_cluster(entry); - } else - ret = 0; + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", -- cgit v1.2.3-59-g8ed1b From 1aa83cfa5a20a6bbd39d2355a89c95152e4b37b4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:10 -0700 Subject: mm/readahead: add DEFINE_READAHEAD Patch series "Readahead patches for 5.9/5.10". These are infrastructure for both the THP patchset and for the fscache rewrite. For both pieces of infrastructure being built on top of this patchset, we want the ractl to be available higher in the call-stack. For David's work, he wants to add the 'critical page' to the ractl so that he knows which page NEEDS to be brought in from storage, and which ones are nice-to-have. We might want something similar in block storage too. It used to be simple -- the first page was the critical one, but then mmap added fault-around and so for that usecase, the middle page is the critical one. Anyway, I don't have any code to show that yet; we just know that the lowest point in the callchain where we have that information is do_sync_mmap_readahead() and so the ractl needs to start its life there. For THP, we have the code that needs it. It's actually the apex patch to the series; the one which finally starts to allocate THPs and present them to consenting filesystems: http://git.infradead.org/users/willy/pagecache.git/commitdiff/798bcf30ab2eff278caad03a9edca74d2f8ae760 This patch (of 8): Allow for a more concise definition of a struct readahead_control. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Cc: David Howells Link: https://lkml.kernel.org/r/20200903140844.14194-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200903140844.14194-3-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 7 +++++++ mm/readahead.c | 6 +----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a0024528a9ee..63c81b512e80 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -812,6 +812,13 @@ struct readahead_control { unsigned int _batch_count; }; +#define DEFINE_READAHEAD(rac, f, m, i) \ + struct readahead_control rac = { \ + .file = f, \ + .mapping = m, \ + ._index = i, \ + } + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. 
diff --git a/mm/readahead.c b/mm/readahead.c index 3c9a8dd7c56c..2126a2754e22 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -179,11 +179,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping, { LIST_HEAD(page_pool); gfp_t gfp_mask = readahead_gfp_mask(mapping); - struct readahead_control rac = { - .mapping = mapping, - .file = file, - ._index = index, - }; + DEFINE_READAHEAD(rac, file, mapping, index); unsigned long i; /* -- cgit v1.2.3-59-g8ed1b From 73bb49da50cd460bb3ba31250ed2e7fbf2115edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:14 -0700 Subject: mm/readahead: make page_cache_ra_unbounded take a readahead_control Define it in the callers instead of in page_cache_ra_unbounded(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-4-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/ext4/verity.c | 4 ++-- fs/f2fs/verity.c | 4 ++-- include/linux/pagemap.h | 5 ++--- mm/readahead.c | 30 ++++++++++++++---------------- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index bbd5e7e0632b..5b7ba8f71153 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 9eb0dba851e8..054ec852b5ea 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 63c81b512e80..37f209ccef0f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -768,9 +768,8 @@ void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, void page_cache_async_readahead(struct address_space *, struct file_ra_state *, struct file *, struct page *, pgoff_t index, unsigned long req_count); -void page_cache_readahead_unbounded(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_count); +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/mm/readahead.c b/mm/readahead.c index 2126a2754e22..a444943781bb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,10 
+158,8 @@ out: } /** - * page_cache_readahead_unbounded - Start unchecked readahead. - * @mapping: File address space. - * @file: This instance of the open file; used for authentication. - * @index: First page index to read. + * page_cache_ra_unbounded - Start unchecked readahead. + * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * @@ -173,13 +171,13 @@ out: * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ -void page_cache_readahead_unbounded(struct address_space *mapping, - struct file *file, pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_size) +void page_cache_ra_unbounded(struct readahead_control *ractl, + unsigned long nr_to_read, unsigned long lookahead_size) { + struct address_space *mapping = ractl->mapping; + unsigned long index = readahead_index(ractl); LIST_HEAD(page_pool); gfp_t gfp_mask = readahead_gfp_mask(mapping); - DEFINE_READAHEAD(rac, file, mapping, index); unsigned long i; /* @@ -200,7 +198,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping, for (i = 0; i < nr_to_read; i++) { struct page *page = xa_load(&mapping->i_pages, index + i); - BUG_ON(index + i != rac._index + rac._nr_pages); + BUG_ON(index + i != ractl->_index + ractl->_nr_pages); if (page && !xa_is_value(page)) { /* @@ -211,7 +209,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping, * have a stable reference to this page, and it's * not worth getting one just for that. */ - read_pages(&rac, &page_pool, true); + read_pages(ractl, &page_pool, true); continue; } @@ -224,12 +222,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping, } else if (add_to_page_cache_lru(page, mapping, index + i, gfp_mask) < 0) { put_page(page); - read_pages(&rac, &page_pool, true); + read_pages(ractl, &page_pool, true); continue; } if (i == nr_to_read - lookahead_size) SetPageReadahead(page); - rac._nr_pages++; + ractl->_nr_pages++; } /* @@ -237,10 +235,10 @@ void page_cache_readahead_unbounded(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - read_pages(&rac, &page_pool, false); + read_pages(ractl, &page_pool, false); memalloc_nofs_restore(nofs); } -EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); +EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * __do_page_cache_readahead() actually reads a chunk of disk. 
It allocates @@ -252,6 +250,7 @@ void __do_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { + DEFINE_READAHEAD(ractl, file, mapping, index); struct inode *inode = mapping->host; loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ @@ -266,8 +265,7 @@ void __do_page_cache_readahead(struct address_space *mapping, if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; - page_cache_readahead_unbounded(mapping, file, index, nr_to_read, - lookahead_size); + page_cache_ra_unbounded(&ractl, nr_to_read, lookahead_size); } /* -- cgit v1.2.3-59-g8ed1b From 8238287eadb2ddaedd920ba06021c33ec36d0320 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:17 -0700 Subject: mm/readahead: make do_page_cache_ra take a readahead_control Rename __do_page_cache_readahead() to do_page_cache_ra() and call it directly from ondemand_readahead() instead of indirecting via ra_submit(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-5-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/internal.h | 11 +++++------ mm/readahead.c | 28 +++++++++++++++------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index a801a4d51f26..41ade6c184ec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,18 +51,17 @@ void unmap_page_range(struct mmu_gather *tlb, void force_page_cache_readahead(struct address_space *, struct file *, pgoff_t index, unsigned long nr_to_read); -void __do_page_cache_readahead(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_size); +void do_page_cache_ra(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_size); /* * Submit IO for the read-ahead request in file_ra_state. */ static inline void ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) + struct address_space *mapping, struct file *file) { - __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + DEFINE_READAHEAD(ractl, file, mapping, ra->start); + do_page_cache_ra(&ractl, ra->size, ra->async_size); } struct page *find_get_entry(struct address_space *mapping, pgoff_t index); diff --git a/mm/readahead.c b/mm/readahead.c index a444943781bb..577f180d9252 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -241,17 +241,16 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. 
*/ -void __do_page_cache_readahead(struct address_space *mapping, - struct file *file, pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_size) +void do_page_cache_ra(struct readahead_control *ractl, + unsigned long nr_to_read, unsigned long lookahead_size) { - DEFINE_READAHEAD(ractl, file, mapping, index); - struct inode *inode = mapping->host; + struct inode *inode = ractl->mapping->host; + unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ @@ -265,7 +264,7 @@ void __do_page_cache_readahead(struct address_space *mapping, if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; - page_cache_ra_unbounded(&ractl, nr_to_read, lookahead_size); + page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* @@ -273,10 +272,11 @@ void __do_page_cache_readahead(struct address_space *mapping, * memory at once. */ void force_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t index, unsigned long nr_to_read) + struct file *file, pgoff_t index, unsigned long nr_to_read) { + DEFINE_READAHEAD(ractl, file, mapping, index); struct backing_dev_info *bdi = inode_to_bdi(mapping->host); - struct file_ra_state *ra = &filp->f_ra; + struct file_ra_state *ra = &file->f_ra; unsigned long max_pages; if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && @@ -294,7 +294,7 @@ void force_page_cache_readahead(struct address_space *mapping, if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); + do_page_cache_ra(&ractl, this_chunk, 0); index += this_chunk; nr_to_read -= this_chunk; @@ -432,10 +432,11 @@ static int try_context_readahead(struct address_space *mapping, * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, + struct file_ra_state *ra, struct file *file, bool hit_readahead_marker, pgoff_t index, unsigned long req_size) { + DEFINE_READAHEAD(ractl, file, mapping, index); struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; @@ -516,7 +517,7 @@ static void ondemand_readahead(struct address_space *mapping, * standalone, small random read * Read as is, and do not pollute the readahead state. */ - __do_page_cache_readahead(mapping, filp, index, req_size, 0); + do_page_cache_ra(&ractl, req_size, 0); return; initial_readahead: @@ -542,7 +543,8 @@ readit: } } - ra_submit(ra, mapping, filp); + ractl._index = ra->start; + do_page_cache_ra(&ractl, ra->size, ra->async_size); } /** -- cgit v1.2.3-59-g8ed1b From 6e4af69ae9b7ca1bc2f82d3bf061ef8abedeabc4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Oct 2020 20:06:21 -0700 Subject: mm/readahead: make ondemand_readahead take a readahead_control Make ondemand_readahead() take a readahead_control struct in preparation for making do_sync_mmap_readahead() pass down an RAC struct. 
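To make the new calling convention concrete, the pattern the converted callers follow looks roughly like this (an illustrative sketch only, not part of the patch; the names mirror the diff below, where page_cache_sync_readahead() is converted this way):

    /* Build the readahead_control once, near the top of the call-stack. */
    DEFINE_READAHEAD(ractl, filp, mapping, index);

    /* hit_readahead_marker == false: this is a synchronous cache miss. */
    ondemand_readahead(&ractl, ra, false, req_count);
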
Signed-off-by: David Howells Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-6-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 577f180d9252..73110c4148f8 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -431,15 +431,14 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm for trivial sequential/random reads. */ -static void ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - bool hit_readahead_marker, pgoff_t index, +static void ondemand_readahead(struct readahead_control *ractl, + struct file_ra_state *ra, bool hit_readahead_marker, unsigned long req_size) { - DEFINE_READAHEAD(ractl, file, mapping, index); - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; + unsigned long index = readahead_index(ractl); pgoff_t prev_index; /* @@ -477,7 +476,8 @@ static void ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, index + 1, max_pages); + start = page_cache_next_miss(ractl->mapping, index + 1, + max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) @@ -510,14 +510,15 @@ static void ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, index, req_size, max_pages)) + if (try_context_readahead(ractl->mapping, ra, index, req_size, + max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. 
*/ - do_page_cache_ra(&ractl, req_size, 0); + do_page_cache_ra(ractl, req_size, 0); return; initial_readahead: @@ -543,8 +544,8 @@ readit: } } - ractl._index = ra->start; - do_page_cache_ra(&ractl, ra->size, ra->async_size); + ractl->_index = ra->start; + do_page_cache_ra(ractl, ra->size, ra->async_size); } /** @@ -564,6 +565,8 @@ void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, pgoff_t index, unsigned long req_count) { + DEFINE_READAHEAD(ractl, filp, mapping, index); + /* no read-ahead */ if (!ra->ra_pages) return; @@ -578,7 +581,7 @@ void page_cache_sync_readahead(struct address_space *mapping, } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, index, req_count); + ondemand_readahead(&ractl, ra, false, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_readahead); @@ -602,6 +605,8 @@ page_cache_async_readahead(struct address_space *mapping, struct page *page, pgoff_t index, unsigned long req_count) { + DEFINE_READAHEAD(ractl, filp, mapping, index); + /* no read-ahead */ if (!ra->ra_pages) return; @@ -624,7 +629,7 @@ page_cache_async_readahead(struct address_space *mapping, return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, index, req_count); + ondemand_readahead(&ractl, ra, true, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); -- cgit v1.2.3-59-g8ed1b From 7b3df3b9ac7e807bf1b6fcc03077bd80068f3d3c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Oct 2020 20:06:24 -0700 Subject: mm/readahead: pass readahead_control to force_page_cache_ra Reimplement force_page_cache_readahead() as a wrapper around force_page_cache_ra(). Pass the existing readahead_control from page_cache_sync_readahead(). Signed-off-by: David Howells Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-7-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/internal.h | 13 +++++++++---- mm/readahead.c | 18 ++++++++++-------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 41ade6c184ec..059ad0115ec1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,10 +49,15 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -void force_page_cache_readahead(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read); -void do_page_cache_ra(struct readahead_control *, - unsigned long nr_to_read, unsigned long lookahead_size); +void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, + unsigned long lookahead_size); +void force_page_cache_ra(struct readahead_control *, unsigned long nr); +static inline void force_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + force_page_cache_ra(&ractl, nr_to_read); +} /* * Submit IO for the read-ahead request in file_ra_state. diff --git a/mm/readahead.c b/mm/readahead.c index 73110c4148f8..3115ced5faae 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -271,13 +271,13 @@ void do_page_cache_ra(struct readahead_control *ractl, * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. 
*/ -void force_page_cache_readahead(struct address_space *mapping, - struct file *file, pgoff_t index, unsigned long nr_to_read) +void force_page_cache_ra(struct readahead_control *ractl, + unsigned long nr_to_read) { - DEFINE_READAHEAD(ractl, file, mapping, index); + struct address_space *mapping = ractl->mapping; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); - struct file_ra_state *ra = &file->f_ra; - unsigned long max_pages; + struct file_ra_state *ra = &ractl->file->f_ra; + unsigned long max_pages, index; if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && !mapping->a_ops->readahead)) @@ -287,14 +287,16 @@ void force_page_cache_readahead(struct address_space *mapping, * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ + index = readahead_index(ractl); max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); - nr_to_read = min(nr_to_read, max_pages); + nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; - do_page_cache_ra(&ractl, this_chunk, 0); + ractl->_index = index; + do_page_cache_ra(ractl, this_chunk, 0); index += this_chunk; nr_to_read -= this_chunk; @@ -576,7 +578,7 @@ void page_cache_sync_readahead(struct address_space *mapping, /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, index, req_count); + force_page_cache_ra(&ractl, req_count); return; } -- cgit v1.2.3-59-g8ed1b From fefa7c478fdafe71c64b5ddf817ac0271aed1146 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:28 -0700 Subject: mm/readahead: add page_cache_sync_ra and page_cache_async_ra Reimplement page_cache_sync_readahead() and page_cache_async_readahead() as wrappers around versions of the function which take a readahead_control in preparation for making do_sync_mmap_readahead() pass down an RAC struct. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-8-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 64 +++++++++++++++++++++++++++++++++++++++++-------- mm/readahead.c | 58 ++++++++++---------------------------------- 2 files changed, 66 insertions(+), 56 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 37f209ccef0f..c77b7c31b2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -761,16 +761,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); -void page_cache_async_readahead(struct address_space *, struct file_ra_state *, - struct file *, struct page *, pgoff_t index, - unsigned long req_count); -void page_cache_ra_unbounded(struct readahead_control *, - unsigned long nr_to_read, unsigned long lookahead_count); - /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. 
@@ -818,6 +808,60 @@ struct readahead_control { ._index = i, \ } +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *, + unsigned long req_count); +void page_cache_async_ra(struct readahead_control *, struct file_ra_state *, + struct page *, unsigned long req_count); + +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. + */ +static inline +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, pgoff_t index, + unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_sync_ra(&ractl, ra, req_count); +} + +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_async_readahead() should be called when a page is used which + * is marked as PageReadahead; this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. + */ +static inline +void page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + struct page *page, pgoff_t index, unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_async_ra(&ractl, ra, page, req_count); +} + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. diff --git a/mm/readahead.c b/mm/readahead.c index 3115ced5faae..620ac83f35cc 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -550,25 +550,9 @@ readit: do_page_cache_ra(ractl, ra->size, ra->async_size); } -/** - * page_cache_sync_readahead - generic file readahead - * @mapping: address_space which holds the pagecache and I/O vectors - * @ra: file_ra_state which holds the readahead state - * @filp: passed on to ->readpage() and ->readpages() - * @index: Index of first page to be read. - * @req_count: Total number of pages being read by the caller. - * - * page_cache_sync_readahead() should be called when a cache miss happened: - * it will submit the read. The readahead logic may decide to piggyback more - * pages onto the read request if access patterns suggest it will improve - * performance. 
- */ -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - pgoff_t index, unsigned long req_count) +void page_cache_sync_ra(struct readahead_control *ractl, + struct file_ra_state *ra, unsigned long req_count) { - DEFINE_READAHEAD(ractl, filp, mapping, index); - /* no read-ahead */ if (!ra->ra_pages) return; @@ -577,38 +561,20 @@ void page_cache_sync_readahead(struct address_space *mapping, return; /* be dumb */ - if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_ra(&ractl, req_count); + if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) { + force_page_cache_ra(ractl, req_count); return; } /* do read-ahead */ - ondemand_readahead(&ractl, ra, false, req_count); + ondemand_readahead(ractl, ra, false, req_count); } -EXPORT_SYMBOL_GPL(page_cache_sync_readahead); +EXPORT_SYMBOL_GPL(page_cache_sync_ra); -/** - * page_cache_async_readahead - file readahead for marked pages - * @mapping: address_space which holds the pagecache and I/O vectors - * @ra: file_ra_state which holds the readahead state - * @filp: passed on to ->readpage() and ->readpages() - * @page: The page at @index which triggered the readahead call. - * @index: Index of first page to be read. - * @req_count: Total number of pages being read by the caller. - * - * page_cache_async_readahead() should be called when a page is used which - * is marked as PageReadahead; this is a marker to suggest that the application - * has used up enough of the readahead window that we should start pulling in - * more pages. - */ -void -page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t index, - unsigned long req_count) +void page_cache_async_ra(struct readahead_control *ractl, + struct file_ra_state *ra, struct page *page, + unsigned long req_count) { - DEFINE_READAHEAD(ractl, filp, mapping, index); - /* no read-ahead */ if (!ra->ra_pages) return; @@ -624,16 +590,16 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (inode_read_congested(mapping->host)) + if (inode_read_congested(ractl->mapping->host)) return; if (blk_cgroup_congested()) return; /* do read-ahead */ - ondemand_readahead(&ractl, ra, true, req_count); + ondemand_readahead(ractl, ra, true, req_count); } -EXPORT_SYMBOL_GPL(page_cache_async_readahead); +EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { -- cgit v1.2.3-59-g8ed1b From db660d462525c4a152b25e033c3dfa9c25d188e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Oct 2020 20:06:31 -0700 Subject: mm/filemap: fold ra_submit into do_sync_mmap_readahead Fold ra_submit() into its last remaining user and pass the readahead_control struct to both do_page_cache_ra() and page_cache_sync_ra(). 
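As a rough illustration (not from the patch itself) of the call pattern this leaves behind, the sketch below mirrors the do_sync_mmap_readahead() hunk that follows; the function name is hypothetical and it assumes the mm/internal.h declarations introduced earlier in this series:

	/* Hedged sketch: drive read-around through a stack readahead_control. */
	static void sketch_submit_readaround(struct file *file, pgoff_t index)
	{
		struct file_ra_state *ra = &file->f_ra;
		DEFINE_READAHEAD(ractl, file, file->f_mapping, index);

		/* Window placement copied from the mmap read-around heuristic. */
		ra->start = max_t(long, 0, index - ra->ra_pages / 2);
		ra->size = ra->ra_pages;
		ra->async_size = ra->ra_pages / 4;

		ractl._index = ra->start;
		do_page_cache_ra(&ractl, ra->size, ra->async_size);
	}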
Signed-off-by: David Howells Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-9-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 +++++----- mm/internal.h | 10 ---------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2937b57329be..8aad142ec4be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2588,8 +2588,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); struct file *fpin = NULL; - pgoff_t offset = vmf->pgoff; unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ @@ -2600,8 +2600,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (vmf->vma->vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_readahead(mapping, ra, file, offset, - ra->ra_pages); + page_cache_sync_ra(&ractl, ra, ra->ra_pages); return fpin; } @@ -2621,10 +2620,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * mmap read-around */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; - ra_submit(ra, mapping, file); + ractl._index = ra->start; + do_page_cache_ra(&ractl, ra->size, ra->async_size); return fpin; } diff --git a/mm/internal.h b/mm/internal.h index 059ad0115ec1..03654797a3a9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -59,16 +59,6 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, nr_to_read); } -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -static inline void ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *file) -{ - DEFINE_READAHEAD(ractl, file, mapping, ra->start); - do_page_cache_ra(&ractl, ra->size, ra->async_size); -} - struct page *find_get_entry(struct address_space *mapping, pgoff_t index); struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3-59-g8ed1b From b1647dc0deef6e923e30c46f28bf5679125c9a10 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Oct 2020 20:06:35 -0700 Subject: mm/readahead: pass a file_ra_state into force_page_cache_ra The file_ra_state being passed into page_cache_sync_readahead() was being ignored in favour of using the one embedded in the struct file. The only caller for which this makes a difference is the fsverity code if the file has been marked as POSIX_FADV_RANDOM, but it's confusing and worth fixing. 
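To make the confusion concrete, here is a hedged sketch (the caller, its state, and the values are hypothetical) of a caller that maintains its own file_ra_state, much as the fsverity path effectively does; before this change the explicit state was silently ignored in favour of file->f_ra:

	/* Sketch only: a caller-owned readahead state, now actually honoured. */
	static void sketch_private_ra(struct file *file, pgoff_t index,
				      unsigned long nr_to_read)
	{
		struct file_ra_state private_ra = { .ra_pages = 32 };	/* hypothetical value */
		DEFINE_READAHEAD(ractl, file, file->f_mapping, index);

		/* With this patch, force_page_cache_ra() consults private_ra
		 * rather than ractl.file->f_ra as the old code did. */
		force_page_cache_ra(&ractl, &private_ra, nr_to_read);
	}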
Signed-off-by: David Howells Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-10-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +++-- mm/readahead.c | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 03654797a3a9..6345b08ce86c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,12 +51,13 @@ void unmap_page_range(struct mmu_gather *tlb, void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); -void force_page_cache_ra(struct readahead_control *, unsigned long nr); +void force_page_cache_ra(struct readahead_control *, struct file_ra_state *, + unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, mapping, index); - force_page_cache_ra(&ractl, nr_to_read); + force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } struct page *find_get_entry(struct address_space *mapping, pgoff_t index); diff --git a/mm/readahead.c b/mm/readahead.c index 620ac83f35cc..c6ffb76827da 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -272,11 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl, * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, - unsigned long nr_to_read) + struct file_ra_state *ra, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); - struct file_ra_state *ra = &ractl->file->f_ra; unsigned long max_pages, index; if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && @@ -562,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, /* be dumb */ if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) { - force_page_cache_ra(ractl, req_count); + force_page_cache_ra(ractl, ra, req_count); return; } -- cgit v1.2.3-59-g8ed1b From 7d9d46ac87f91b8dedad5241d64382b650e26487 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:06:38 -0700 Subject: mm,hwpoison: cleanup unused PageHuge() check Patch series "HWPOISON: soft offline rework", v7. This patchset fixes a couple of issues that the patchset Naoya sent [1] contained due to rebasing problems and a misunderstanding. The main focus of this series is to stabilize soft offline. Historically, soft offlined pages have suffered from race conditions because PageHWPoison is used a little too aggressively, which (directly or indirectly) invades other mm code which cares little about hwpoison. This results in unexpected behavior or kernel panics, which is very far from soft offline's "do not disturb userspace or other kernel components" policy. An example of this can be found here [2]. Along with several cleanups, this series refactors and changes the way soft offline works. The main point of this change set is to contain the target page "via buddy allocator" or in the migration path. For the former, we first free the target page as we do for normal pages, and once it has reached the buddy allocator and has been taken off the freelists, we flag it as HWpoison. For the latter, we never get to release the page in unmap_and_move, so the page is under our control and we can handle it in hwpoison code.
[1] https://patchwork.kernel.org/cover/11704083/ [2] https://lore.kernel.org/linux-mm/20190826104144.GA7849@linux/T/#u This patch (of 14): Drop the PageHuge check, which is dead code since memory_failure() forks into memory_failure_hugetlb() for hugetlb pages. memory_failure() and memory_failure_hugetlb() share some functions like hwpoison_user_mappings() and identify_page_state(), so they should properly handle 4kB pages, THPs, and hugetlb pages. Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Michal Hocko Cc: Tony Luck Cc: David Hildenbrand Cc: Aneesh Kumar K.V Cc: Dmitry Yakunin Cc: Qian Cai Cc: Dave Hansen Cc: "Aneesh Kumar K.V" Cc: Aristeu Rozanski Cc: Oscar Salvador Link: https://lkml.kernel.org/r/20200922135650.1634-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20200922135650.1634-2-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 990e3b2e37d5..484f968641d1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1381,10 +1381,7 @@ int memory_failure(unsigned long pfn, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - if (PageHuge(p)) - page_flags = hpage->flags; - else - page_flags = p->flags; + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock -- cgit v1.2.3-59-g8ed1b From 1b473becde09d1aec17334a34af70ccdee9fe680 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:06:42 -0700 Subject: mm, hwpoison: remove recalculating hpage hpage is never used after try_to_split_thp_page() in memory_failure(), so we don't have to keep it updated. Let's not recalculate/use hpage. Suggested-by: "Aneesh Kumar K.V" Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-3-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 484f968641d1..6876757f4b85 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1341,7 +1341,6 @@ int memory_failure(unsigned long pfn, int flags) } unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); - hpage = compound_head(p); } /* @@ -1413,11 +1412,8 @@ int memory_failure(unsigned long pfn, int flags) /* * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. - * - * When the raw error page is thp tail page, hpage points to the raw - * page after thp split. */ - if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) { + if (!hwpoison_user_mappings(p, pfn, flags, &p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; -- cgit v1.2.3-59-g8ed1b From fd476720c9ba3cf16617d074a94a0852be468545 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:06:46 -0700 Subject: mm,hwpoison-inject: don't pin for hwpoison_filter Another memory error injection interface, debugfs:hwpoison/corrupt-pfn, also takes a bogus refcount for hwpoison_filter().
It's justified because this does a coarse filter, expecting that memory_failure() redoes the check for sure. Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-4-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e488876b168a..1ae1ebc2b9b1 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -26,11 +26,6 @@ static int hwpoison_inject(void *data, u64 val) p = pfn_to_page(pfn); hpage = compound_head(p); - /* - * This implies unable to support free buddy pages. - */ - if (!get_hwpoison_page(p)) - return 0; if (!hwpoison_filter_enable) goto inject; @@ -40,23 +35,20 @@ static int hwpoison_inject(void *data, u64 val) * This implies unable to support non-LRU pages. */ if (!PageLRU(hpage) && !PageHuge(p)) - goto put_out; + return 0; /* - * do a racy check with elevated page count, to make sure PG_hwpoison - * will only be set for the targeted owner (or on a free page). + * do a racy check to make sure PG_hwpoison will only be set for + * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ err = hwpoison_filter(hpage); if (err) - goto put_out; + return 0; inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); - return memory_failure(pfn, MF_COUNT_INCREASED); -put_out: - put_hwpoison_page(p); - return 0; + return memory_failure(pfn, 0); } static int hwpoison_unpoison(void *data, u64 val) -- cgit v1.2.3-59-g8ed1b From 7e27f22c9e40b66186e0675376f0495725ff1b0a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:50 -0700 Subject: mm,hwpoison: unexport get_hwpoison_page and make it static Since get_hwpoison_page is only used in memory-failure code now, let us un-export it and make it private to that code. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-5-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/memory-failure.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 620961e4f32b..1977c09afe7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); #define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6876757f4b85..95d5d8f3496c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -924,7 +924,7 @@ static int page_action(struct page_state *ps, struct page *p, * Return: return 0 if failed to grab the refcount, otherwise true (some * non-zero value.) 
*/ -int get_hwpoison_page(struct page *page) +static int get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); @@ -953,7 +953,6 @@ int get_hwpoison_page(struct page *page) return 0; } -EXPORT_SYMBOL_GPL(get_hwpoison_page); /* * Do all that is necessary to remove user space mappings. Unmap -- cgit v1.2.3-59-g8ed1b From dc7560b496f9e045c675ae160afda010ec0c77f6 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:53 -0700 Subject: mm,hwpoison: refactor madvise_inject_error Make a proper if-else condition for {hard,soft}-offline. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: Michal Hocko Cc: Qian Cai Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Mike Kravetz Link: https://lkml.kernel.org/r/20200908075626.11976-3-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/madvise.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 9b065d412e5f..0039bfe4ca9f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -872,7 +872,6 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { - struct page *page; struct zone *zone; unsigned long size; @@ -882,6 +881,7 @@ static int madvise_inject_error(int behavior, for (; start < end; start += size) { unsigned long pfn; + struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); @@ -903,25 +903,21 @@ static int madvise_inject_error(int behavior, if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", - pfn, start); - + pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); - if (ret) - return ret; - continue; + } else { + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", + pfn, start); + /* + * Drop the page reference taken by get_user_pages_fast(). In + * the absence of MF_COUNT_INCREASED the memory_failure() + * routine is responsible for pinning the page to prevent it + * from being released back to the page allocator. + */ + put_page(page); + ret = memory_failure(pfn, 0); } - pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", - pfn, start); - - /* - * Drop the page reference taken by get_user_pages_fast(). In - * the absence of MF_COUNT_INCREASED the memory_failure() - * routine is responsible for pinning the page to prevent it - * from being released back to the page allocator. - */ - put_page(page); - ret = memory_failure(pfn, 0); if (ret) return ret; } -- cgit v1.2.3-59-g8ed1b From dd6e2402fad966290f35dc687294fb6049714aac Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:57 -0700 Subject: mm,hwpoison: kill put_hwpoison_page After commit 4e41a30c6d50 ("mm: hwpoison: adjust for new thp refcounting"), put_hwpoison_page got reduced to a put_page. Let us just use put_page instead. 
Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-7-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/memory-failure.c | 30 +++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1977c09afe7a..ab038a3521b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 95d5d8f3496c..4a9f8dc01160 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1143,7 +1143,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_dec(); unlock_page(head); - put_hwpoison_page(head); + put_page(head); return 0; } @@ -1335,7 +1335,7 @@ int memory_failure(unsigned long pfn, int flags) pfn); if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - put_hwpoison_page(p); + put_page(p); return -EBUSY; } unlock_page(p); @@ -1388,14 +1388,14 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_dec(); unlock_page(p); - put_hwpoison_page(p); + put_page(p); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); - put_hwpoison_page(p); + put_page(p); return 0; } @@ -1629,9 +1629,9 @@ int unpoison_memory(unsigned long pfn) } unlock_page(page); - put_hwpoison_page(page); + put_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) - put_hwpoison_page(page); + put_page(page); return 0; } @@ -1692,7 +1692,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) /* * Try to free it. */ - put_hwpoison_page(page); + put_page(page); shake_page(page, 1); /* @@ -1701,7 +1701,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) ret = __get_any_page(page, pfn, 0); if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ - put_hwpoison_page(page); + put_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", pfn, page->flags, &page->flags); return -EIO; @@ -1724,7 +1724,7 @@ static int soft_offline_huge_page(struct page *page, int flags) lock_page(hpage); if (PageHWPoison(hpage)) { unlock_page(hpage); - put_hwpoison_page(hpage); + put_page(hpage); pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } @@ -1735,7 +1735,7 @@ static int soft_offline_huge_page(struct page *page, int flags) * get_any_page() and isolate_huge_page() takes a refcount each, * so need to drop one here. 
*/ - put_hwpoison_page(hpage); + put_page(hpage); if (!ret) { pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); return -EBUSY; @@ -1784,7 +1784,7 @@ static int __soft_offline_page(struct page *page, int flags) wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); - put_hwpoison_page(page); + put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1799,7 +1799,7 @@ static int __soft_offline_page(struct page *page, int flags) * would need to fix isolation locking first. */ if (ret == 1) { - put_hwpoison_page(page); + put_page(page); pr_info("soft_offline: %#lx: invalidated\n", pfn); SetPageHWPoison(page); num_poisoned_pages_inc(); @@ -1819,7 +1819,7 @@ static int __soft_offline_page(struct page *page, int flags) * Drop page reference which is came from get_any_page() * successful isolate_lru_page() already took another one. */ - put_hwpoison_page(page); + put_page(page); if (!ret) { LIST_HEAD(pagelist); /* @@ -1863,7 +1863,7 @@ static int soft_offline_in_use_page(struct page *page, int flags) pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); else pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(page); + put_page(page); return -EBUSY; } unlock_page(page); @@ -1936,7 +1936,7 @@ int soft_offline_page(unsigned long pfn, int flags) if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); if (flags & MF_COUNT_INCREASED) - put_hwpoison_page(page); + put_page(page); return -EBUSY; } -- cgit v1.2.3-59-g8ed1b From 694bf0b0cdf91be50e6f037ca93733ed83ca1187 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:01 -0700 Subject: mm,hwpoison: unify THP handling for hard and soft offline Place the THP's page handling in a helper and use it from both hard and soft-offline machinery, so we get rid of some duplicated code. 
Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-8-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4a9f8dc01160..2ff4ba7fa99f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1102,6 +1102,25 @@ static int identify_page_state(unsigned long pfn, struct page *p, return page_action(ps, p, pfn); } +static int try_to_split_thp_page(struct page *page, const char *msg) +{ + lock_page(page); + if (!PageAnon(page) || unlikely(split_huge_page(page))) { + unsigned long pfn = page_to_pfn(page); + + unlock_page(page); + if (!PageAnon(page)) + pr_info("%s: %#lx: non anonymous thp\n", msg, pfn); + else + pr_info("%s: %#lx: thp split failed\n", msg, pfn); + put_page(page); + return -EBUSY; + } + unlock_page(page); + + return 0; +} + static int memory_failure_hugetlb(unsigned long pfn, int flags) { struct page *p = pfn_to_page(pfn); @@ -1324,21 +1343,8 @@ int memory_failure(unsigned long pfn, int flags) } if (PageTransHuge(hpage)) { - lock_page(p); - if (!PageAnon(p) || unlikely(split_huge_page(p))) { - unlock_page(p); - if (!PageAnon(p)) - pr_err("Memory failure: %#lx: non anonymous thp\n", - pfn); - else - pr_err("Memory failure: %#lx: thp split failed\n", - pfn); - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - put_page(p); + if (try_to_split_thp_page(p, "Memory Failure") < 0) return -EBUSY; - } - unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); } @@ -1855,19 +1861,9 @@ static int soft_offline_in_use_page(struct page *page, int flags) int mt; struct page *hpage = compound_head(page); - if (!PageHuge(page) && PageTransHuge(hpage)) { - lock_page(page); - if (!PageAnon(page) || unlikely(split_huge_page(page))) { - unlock_page(page); - if (!PageAnon(page)) - pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); - else - pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_page(page); + if (!PageHuge(page) && PageTransHuge(hpage)) + if (try_to_split_thp_page(page, "soft offline") < 0) return -EBUSY; - } - unlock_page(page); - } /* * Setting MIGRATE_ISOLATE here ensures that the page will be linked -- cgit v1.2.3-59-g8ed1b From 06be6ff3d2ec8be806b859fc054a1909b16d2473 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:05 -0700 Subject: mm,hwpoison: rework soft offline for free pages When trying to soft-offline a free page, we need to first take it off the buddy allocator. Once we know it is out of reach, we can safely flag it as poisoned. take_page_off_buddy will be used to take a page meant to be poisoned off the buddy allocator. take_page_off_buddy calls break_down_buddy_pages, which splits a higher-order page in case our page belongs to one. Once the page is under our control, we call page_handle_poison to set it as poisoned and grab a refcount on it.
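A worked example may help picture the split that break_down_buddy_pages() performs; the numbers below are chosen purely for illustration and do not appear in the patch:

	/*
	 * Illustrative walk of break_down_buddy_pages(zone, page, target = page + 5,
	 * low = 0, high = 3, mt) on an order-3 buddy block covering pages 0-7:
	 *
	 *   high=2, size=4: the target sits in the upper half, so pages 0-3 go
	 *                   back to the order-2 freelist and we descend into 4-7.
	 *   high=1, size=2: the target sits in the lower half, so pages 6-7 go
	 *                   back to the order-1 freelist and we stay on 4-5.
	 *   high=0, size=1: the target is page 5, so page 4 goes back at order 0.
	 *
	 * Only page 5 is left off the freelists, ready to be flagged HWPoison.
	 */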
Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-9-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + mm/memory-failure.c | 18 ++++++++---- mm/page_alloc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 38ded408bd4c..a02b6d0221db 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -432,6 +432,7 @@ PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool set_hwpoison_free_buddy_page(struct page *page); +extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) static inline bool set_hwpoison_free_buddy_page(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2ff4ba7fa99f..af7fb23b93ae 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -65,6 +65,13 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); +static void page_handle_poison(struct page *page) +{ + SetPageHWPoison(page); + page_ref_inc(page); + num_poisoned_pages_inc(); +} + #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) u32 hwpoison_filter_enable = 0; @@ -1884,14 +1891,13 @@ static int soft_offline_in_use_page(struct page *page, int flags) static int soft_offline_free_page(struct page *page) { - int rc = dissolve_free_huge_page(page); + int rc = -EBUSY; - if (!rc) { - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - else - rc = -EBUSY; + if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) { + page_handle_poison(page); + rc = 0; } + return rc; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index febb2d731f9e..12dcfb3dc588 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8777,6 +8777,74 @@ bool is_free_buddy_page(struct page *page) } #ifdef CONFIG_MEMORY_FAILURE +/* + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. + */ +static void break_down_buddy_pages(struct zone *zone, struct page *page, + struct page *target, int low, int high, + int migratetype) +{ + unsigned long size = 1 << high; + struct page *current_buddy, *next_page; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { + next_page = page + size; + current_buddy = page; + } else { + next_page = page; + current_buddy = page + size; + } + + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; + + if (current_buddy != target) { + add_to_free_list(current_buddy, zone, high, migratetype); + set_page_order(current_buddy, high); + page = next_page; + } + } +} + +/* + * Take a page that will be marked as poisoned off the buddy allocator. 
+ */ +bool take_page_off_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + unsigned int order; + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + int buddy_order = page_order(page_head); + + if (PageBuddy(page_head) && buddy_order >= order) { + unsigned long pfn_head = page_to_pfn(page_head); + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); + + del_page_from_free_list(page_head, zone, buddy_order); + break_down_buddy_pages(zone, page_head, page, 0, + buddy_order, migratetype); + ret = true; + break; + } + if (page_count(page_head) > 0) + break; + } + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} + /* * Set PG_hwpoison flag if a given page is confirmed to be a free page. This * test is performed under the zone lock to prevent a race against page -- cgit v1.2.3-59-g8ed1b From 79f5f8fab482dfff62948214468ac4ebbf0a016f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:09 -0700 Subject: mm,hwpoison: rework soft offline for in-use pages This patch changes the way we set and handle in-use poisoned pages. Until now, poisoned pages were released to the buddy allocator, trusting that the checks that take place at allocation time would act as a safety net and would skip that page. This has proved to be wrong, as there are pfn walkers out there, like compaction, that only care about the page being in a buddy freelist. Although compaction might not be the only such user, having poisoned pages in the buddy allocator seems a bad idea, as we should only have free pages that are ready and meant to be used as such. Before explaining the taken approach, let us break down the kinds of pages we can soft offline. - Anonymous THP (after the split, they end up being 4K pages) - Hugetlb - Order-0 pages (that can be either migrated or invalidated) * Normal pages (order-0 and anon-THP) - If they are clean and unmapped page cache pages, we invalidate them by means of invalidate_inode_page(). - If they are mapped/dirty, we do the isolate-and-migrate dance. Either way, we do not call put_page directly from those paths. Instead, we keep the page and send it to page_handle_poison to perform the right handling. page_handle_poison sets the HWPoison flag and does the last put_page. Down the chain, we placed a check for HWPoison pages in free_pages_prepare that just skips any poisoned page, so those pages do not end up in any pcplist/freelist. After that, we set the refcount on the page to 1 and we increment the poisoned pages counter. If we see that the check in free_pages_prepare creates trouble, we can always do what we do for free pages: - wait until the page hits buddy's freelists - take it off, and flag it The downside of the above approach is that we could race with an allocation, so by the time we want to take the page off the buddy, the page has already been allocated and we cannot soft offline it. But the user could always retry it. * Hugetlb pages - We isolate-and-migrate them After the migration has been successful, we call dissolve_free_huge_page, and we set HWPoison on the page if we succeed. Hugetlb has a slightly different handling though. While for non-hugetlb pages we cared about closing the race with an allocation, doing so for hugetlb pages requires quite some additional and intrusive code (we would need to hook in free_huge_page and some other places).
So I decided not to make the code overly complicated and to just fail normally if the page was allocated in the meantime. We can always build on top of this. As a bonus, because of the way we now handle in-use pages, we no longer need the put-as-isolation-migratetype dance that guarded against poisoned pages ending up in pcplists. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-10-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ----- mm/memory-failure.c | 43 ++++++++++++++----------------------------- mm/migrate.c | 11 +++-------- mm/page_alloc.c | 39 +++++++++++----------------------------- 4 files changed, 28 insertions(+), 70 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a02b6d0221db..4f6ba9379112 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -431,14 +431,9 @@ PAGEFLAG_FALSE(Uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -extern bool set_hwpoison_free_buddy_page(struct page *page); extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) -static inline bool set_hwpoison_free_buddy_page(struct page *page) -{ - return 0; -} #define __PG_HWPOISON 0 #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index af7fb23b93ae..962129c50be2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -65,9 +65,11 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); -static void page_handle_poison(struct page *page) +static void page_handle_poison(struct page *page, bool release) { SetPageHWPoison(page); + if (release) + put_page(page); page_ref_inc(page); num_poisoned_pages_inc(); } @@ -1765,19 +1767,13 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = -EIO; } else { /* - * We set PG_hwpoison only when the migration source hugepage - * was successfully dissolved, because otherwise hwpoisoned - * hugepage remains on free hugepage list, then userspace will - * find it as SIGBUS by allocation failure. That's not expected - * in soft-offlining. + * We set PG_hwpoison only when we were able to take the page + * off the buddy. */ - ret = dissolve_free_huge_page(page); - if (!ret) { - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - else - ret = -EBUSY; - } + if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) + page_handle_poison(page, false); + else + ret = -EBUSY; } return ret; } @@ -1812,10 +1808,8 @@ static int __soft_offline_page(struct page *page, int flags) * would need to fix isolation locking first.
*/ if (ret == 1) { - put_page(page); pr_info("soft_offline: %#lx: invalidated\n", pfn); - SetPageHWPoison(page); - num_poisoned_pages_inc(); + page_handle_poison(page, true); return 0; } @@ -1846,7 +1840,9 @@ static int __soft_offline_page(struct page *page, int flags) list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); - if (ret) { + if (!ret) { + page_handle_poison(page, true); + } else { if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); @@ -1865,27 +1861,16 @@ static int __soft_offline_page(struct page *page, int flags) static int soft_offline_in_use_page(struct page *page, int flags) { int ret; - int mt; struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) if (try_to_split_thp_page(page, "soft offline") < 0) return -EBUSY; - /* - * Setting MIGRATE_ISOLATE here ensures that the page will be linked - * to free list immediately (not via pcplist) when released after - * successful page migration. Otherwise we can't guarantee that the - * page is really free after put_page() returns, so - * set_hwpoison_free_buddy_page() highly likely fails. - */ - mt = get_pageblock_migratetype(page); - set_pageblock_migratetype(page, MIGRATE_ISOLATE); if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); else ret = __soft_offline_page(page, flags); - set_pageblock_migratetype(page, mt); return ret; } @@ -1894,7 +1879,7 @@ static int soft_offline_free_page(struct page *page) int rc = -EBUSY; if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) { - page_handle_poison(page); + page_handle_poison(page, false); rc = 0; } diff --git a/mm/migrate.c b/mm/migrate.c index f94d7c7eeddf..4cf1af88c1dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1223,16 +1223,11 @@ out: * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { - put_page(page); - if (reason == MR_MEMORY_FAILURE) { + if (reason != MR_MEMORY_FAILURE) /* - * Set PG_HWPoison on just freed page - * intentionally. Although it's rather weird, - * it's how HWPoison flag works at the moment. + * We release the page in page_handle_poison. */ - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - } + put_page(page); } else { if (rc != -EAGAIN) { if (likely(!__PageMovable(page))) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12dcfb3dc588..7fa55d9a3fe4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1174,6 +1174,17 @@ static __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); + if (unlikely(PageHWPoison(page)) && !order) { + /* + * Do not let hwpoison pages hit pcplists/buddy + * Untie memcg state and reset page's owner + */ + if (memcg_kmem_enabled() && PageKmemcg(page)) + __memcg_kmem_uncharge_page(page, order); + reset_page_owner(page, order); + return false; + } + /* * Check tail pages before head page information is cleared to * avoid checking PageCompound for order-0 pages. @@ -8844,32 +8855,4 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } - -/* - * Set PG_hwpoison flag if a given page is confirmed to be a free page. This - * test is performed under the zone lock to prevent a race against page - * allocation. 
- */ -bool set_hwpoison_free_buddy_page(struct page *page) -{ - struct zone *zone = page_zone(page); - unsigned long pfn = page_to_pfn(page); - unsigned long flags; - unsigned int order; - bool hwpoisoned = false; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - struct page *page_head = page - (pfn & ((1 << order) - 1)); - - if (PageBuddy(page_head) && page_order(page_head) >= order) { - if (!TestSetPageHWPoison(page)) - hwpoisoned = true; - break; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - - return hwpoisoned; -} #endif -- cgit v1.2.3-59-g8ed1b From 6b9a217eda4a13cc72914fdf7433712122ff595b Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:13 -0700 Subject: mm,hwpoison: refactor soft_offline_huge_page and __soft_offline_page Merging soft_offline_huge_page and __soft_offline_page let us get rid of quite some duplicated code, and makes the code much easier to follow. Now, __soft_offline_page will handle both normal and hugetlb pages. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-11-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 182 +++++++++++++++++++++++----------------------------- 1 file changed, 82 insertions(+), 100 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 962129c50be2..93b32d4534e3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -65,13 +65,31 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); -static void page_handle_poison(struct page *page, bool release) +static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) { + if (hugepage_or_freepage) { + /* + * Doing this check for free pages is also fine since dissolve_free_huge_page + * returns 0 for non-hugetlb pages as well. + */ + if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) + /* + * We could fail to take off the target page from buddy + * for example due to racy page allocaiton, but that's + * acceptable because soft-offlined page is not broken + * and if someone really want to use it, they should + * take it. + */ + return false; + } + SetPageHWPoison(page); if (release) put_page(page); page_ref_inc(page); num_poisoned_pages_inc(); + + return true; } #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) @@ -1725,63 +1743,51 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) return ret; } -static int soft_offline_huge_page(struct page *page, int flags) +static bool isolate_page(struct page *page, struct list_head *pagelist) { - int ret; - unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_head(page); - LIST_HEAD(pagelist); + bool isolated = false; + bool lru = PageLRU(page); - /* - * This double-check of PageHWPoison is to avoid the race with - * memory_failure(). See also comment in __soft_offline_page(). 
- */ - lock_page(hpage); - if (PageHWPoison(hpage)) { - unlock_page(hpage); - put_page(hpage); - pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); - return -EBUSY; + if (PageHuge(page)) { + isolated = isolate_huge_page(page, pagelist); + } else { + if (lru) + isolated = !isolate_lru_page(page); + else + isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE); + + if (isolated) + list_add(&page->lru, pagelist); } - unlock_page(hpage); - ret = isolate_huge_page(hpage, &pagelist); + if (isolated && lru) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_lru(page)); + /* - * get_any_page() and isolate_huge_page() takes a refcount each, - * so need to drop one here. + * If we succeed to isolate the page, we grabbed another refcount on + * the page, so we can safely drop the one we got from get_any_pages(). + * If we failed to isolate the page, it means that we cannot go further + * and we will return an error, so drop the reference we got from + * get_any_pages() as well. */ - put_page(hpage); - if (!ret) { - pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); - return -EBUSY; - } - - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC, MR_MEMORY_FAILURE); - if (ret) { - pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", - pfn, ret, page->flags, &page->flags); - if (!list_empty(&pagelist)) - putback_movable_pages(&pagelist); - if (ret > 0) - ret = -EIO; - } else { - /* - * We set PG_hwpoison only when we were able to take the page - * off the buddy. - */ - if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) - page_handle_poison(page, false); - else - ret = -EBUSY; - } - return ret; + put_page(page); + return isolated; } -static int __soft_offline_page(struct page *page, int flags) +/* + * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. + * If the page is a non-dirty unmapped page-cache page, it simply invalidates. + * If the page is mapped, it migrates the contents over. + */ +static int __soft_offline_page(struct page *page) { - int ret; + int ret = 0; unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + char const *msg_page[] = {"page", "hugepage"}; + bool huge = PageHuge(page); + LIST_HEAD(pagelist); /* * Check PageHWPoison again inside page lock because PageHWPoison @@ -1790,98 +1796,74 @@ static int __soft_offline_page(struct page *page, int flags) * so there's no race between soft_offline_page() and memory_failure(). */ lock_page(page); - wait_on_page_writeback(page); + if (!PageHuge(page)) + wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } - /* - * Try to invalidate first. This should work for - * non dirty unmapped page cache pages. - */ - ret = invalidate_inode_page(page); + + if (!PageHuge(page)) + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); unlock_page(page); + /* * RED-PEN would be better to keep it isolated here, but we * would need to fix isolation locking first. */ - if (ret == 1) { + if (ret) { pr_info("soft_offline: %#lx: invalidated\n", pfn); - page_handle_poison(page, true); + page_handle_poison(page, false, true); return 0; } - /* - * Simple invalidation didn't work. - * Try to migrate to a new page instead. migrate.c - * handles a large number of cases for us. 
- */ - if (PageLRU(page)) - ret = isolate_lru_page(page); - else - ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - /* - * Drop page reference which is came from get_any_page() - * successful isolate_lru_page() already took another one. - */ - put_page(page); - if (!ret) { - LIST_HEAD(pagelist); - /* - * After isolated lru page, the PageLRU will be cleared, - * so use !__PageMovable instead for LRU page's mapping - * cannot have PAGE_MAPPING_MOVABLE. - */ - if (!__PageMovable(page)) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); - list_add(&page->lru, &pagelist); + if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (!ret) { - page_handle_poison(page, true); + bool release = !huge; + + if (!page_handle_poison(page, huge, release)) + ret = -EBUSY; } else { if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", - pfn, ret, page->flags, &page->flags); + pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n", + pfn, msg_page[huge], ret, page->flags, &page->flags); if (ret > 0) ret = -EIO; } } else { - pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n", - pfn, ret, page_count(page), page->flags, &page->flags); + pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n", + pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags); + ret = -EBUSY; } return ret; } -static int soft_offline_in_use_page(struct page *page, int flags) +static int soft_offline_in_use_page(struct page *page) { - int ret; struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) if (try_to_split_thp_page(page, "soft offline") < 0) return -EBUSY; - - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - return ret; + return __soft_offline_page(page); } static int soft_offline_free_page(struct page *page) { - int rc = -EBUSY; + int rc = 0; - if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) { - page_handle_poison(page, false); - rc = 0; - } + if (!page_handle_poison(page, true, false)) + rc = -EBUSY; return rc; } @@ -1932,7 +1914,7 @@ int soft_offline_page(unsigned long pfn, int flags) put_online_mems(); if (ret > 0) - ret = soft_offline_in_use_page(page, flags); + ret = soft_offline_in_use_page(page); else if (ret == 0) ret = soft_offline_free_page(page); -- cgit v1.2.3-59-g8ed1b From 5a2ffca3c23333a41cf8604f62994cfd28e4267b Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:17 -0700 Subject: mm,hwpoison: return 0 if the page is already poisoned in soft-offline Currently, there is an inconsistency when calling soft-offline from different paths on a page that is already poisoned. 1) madvise: madvise_inject_error skips any poisoned page and continues the loop. If that was the only page to madvise, it returns 0. 2) /sys/devices/system/memory/: When calling soft_offline_page_store()->soft_offline_page(), we return -EBUSY in case the page is already poisoned. This is inconsistent with a) the above example and b) memory_failure, where we return 0 if the page was poisoned. Fix this by dropping the PageHWPoison() check in madvise_inject_error, and let soft_offline_page return 0 if it finds the page already poisoned. 
Please note that this represents a user-API change, since the error returned when calling soft_offline_page_store()->soft_offline_page() will now be different. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-12-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/madvise.c | 5 ----- mm/memory-failure.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 0039bfe4ca9f..68b761c359d0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -896,11 +896,6 @@ static int madvise_inject_error(int behavior, */ size = page_size(compound_head(page)); - if (PageHWPoison(page)) { - put_page(page); - continue; - } - if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 93b32d4534e3..1fb290faf28e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1802,7 +1802,7 @@ static int __soft_offline_page(struct page *page) unlock_page(page); put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); - return -EBUSY; + return 0; } if (!PageHuge(page)) @@ -1906,7 +1906,7 @@ int soft_offline_page(unsigned long pfn, int flags) pr_info("soft offline: %#lx page already poisoned\n", pfn); if (flags & MF_COUNT_INCREASED) put_page(page); - return -EBUSY; + return 0; } get_online_mems(); -- cgit v1.2.3-59-g8ed1b From 5d1fd5dc877bc1c670e7b1c174aa659b76c07de1 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:07:21 -0700 Subject: mm,hwpoison: introduce MF_MSG_UNSPLIT_THP memory_failure() is supposed to call action_result() when it handles a memory error event, but there's one missing case. So let's add it. I found that include/ras/ras_event.h was missing some other MF_MSG_* entries, so this patch also adds them.
Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-13-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/ras/ras_event.h | 3 +++ mm/memory-failure.c | 5 ++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ab038a3521b4..a9df46309e07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3064,6 +3064,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 36c5c5e38c1d..0bdbc0d17d2f 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -361,6 +361,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ @@ -373,6 +374,8 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ + EM ( MF_MSG_DAX, "dax page" ) \ + EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1fb290faf28e..f9fa9982b5d4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -582,6 +582,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", + [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1370,8 +1371,10 @@ int memory_failure(unsigned long pfn, int flags) } if (PageTransHuge(hpage)) { - if (try_to_split_thp_page(p, "Memory Failure") < 0) + if (try_to_split_thp_page(p, "Memory Failure") < 0) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); return -EBUSY; + } VM_BUG_ON_PAGE(!page_count(p), p); } -- cgit v1.2.3-59-g8ed1b From 1f2481ddbe444de5bed72f167d7180d1b2708e56 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:07:25 -0700 Subject: mm,hwpoison: double-check page count in __get_any_page() Soft offlining could fail with EIO due to the race condition with hugepage migration. This issue became visible due to the change in the previous patch that makes the soft offline handler take the page refcount on its own. We have no way to directly pin a zero-refcount page, and a page considered to have a zero refcount could be allocated just after the first check. This patch adds a second check to catch the race and gives us a chance to handle it more reliably.
Reported-by: Qian Cai Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-14-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f9fa9982b5d4..7c63ba9ff6e7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1707,6 +1707,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) } else if (is_free_buddy_page(p)) { pr_info("%s: %#lx free buddy page\n", __func__, pfn); ret = 0; + } else if (page_count(p)) { + /* raced with allocation */ + ret = -EBUSY; } else { pr_info("%s: %#lx: unknown zero refcount page type %lx\n", __func__, pfn, p->flags); @@ -1723,6 +1726,9 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) { int ret = __get_any_page(page, pfn, flags); + if (ret == -EBUSY) + ret = __get_any_page(page, pfn, flags); + if (ret == 1 && !PageHuge(page) && !PageLRU(page) && !__PageMovable(page)) { /* -- cgit v1.2.3-59-g8ed1b From b94e02822debdf0cc473556aad7dcc859f216653 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:29 -0700 Subject: mm,hwpoison: try to narrow window race for free pages Aristeu Rozanski reported that a customer test case started to report -EBUSY after the hwpoison rework patchset. There is a race window between spotting a free page and taking it off its buddy freelist, so it might be that by the time we try to take it off, the page has already been allocated. This patch tries to handle such a race window by handling the new page type again if the page was allocated under us. Reported-by: Aristeu Rozanski Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Tested-by: Aristeu Rozanski Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-15-osalvador@suse.de Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7c63ba9ff6e7..a2184b721fbf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1903,6 +1903,7 @@ int soft_offline_page(unsigned long pfn, int flags) { int ret; struct page *page; + bool try_again = true; if (!pfn_valid(pfn)) return -ENXIO; @@ -1918,6 +1919,7 @@ int soft_offline_page(unsigned long pfn, int flags) return 0; } +retry: get_online_mems(); ret = get_any_page(page, pfn, flags); put_online_mems(); @@ -1925,7 +1927,10 @@ int soft_offline_page(unsigned long pfn, int flags) if (ret > 0) ret = soft_offline_in_use_page(page); else if (ret == 0) - ret = soft_offline_free_page(page); + if (soft_offline_free_page(page) && try_again) { + try_again = false; + goto retry; + } return ret; } -- cgit v1.2.3-59-g8ed1b From 11c9c7edae06da789abfdeefe5123162a3f1c7dc Mon Sep 17 00:00:00 2001 From: Mateusz Nosek Date: Thu, 15 Oct 2020 20:07:33 -0700 Subject: mm/page_poison.c: replace bool variable with static key The variable 'want_page_poisoning' is a switch deciding whether page poisoning should be enabled. This patch changes it to a static key.
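For context, the usual static-key pattern looks roughly like this (a generic sketch using the plain DEFINE_STATIC_KEY_FALSE variant; the patch below uses the read-only _RO flavor, and all names here are made up for illustration). The benefit is that the disabled fast path is patched to a no-op in the instruction stream, instead of paying a memory load plus conditional branch on every call:

  #include <linux/jump_label.h>

  static DEFINE_STATIC_KEY_FALSE(my_feature);

  static int __init my_feature_param(char *buf)
  {
  	static_branch_enable(&my_feature);	/* slow path: patches the code */
  	return 0;
  }
  early_param("my_feature", my_feature_param);

  static bool my_feature_enabled(void)
  {
  	return static_branch_unlikely(&my_feature);	/* hot path */
  }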
Signed-off-by: Mateusz Nosek Signed-off-by: Andrew Morton Cc: Naoya Horiguchi Cc: Oscar Salvador Link: https://lkml.kernel.org/r/20200921152931.938-1-mateusznosek0@gmail.com Signed-off-by: Linus Torvalds --- mm/page_poison.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index 34b9181ee5d1..ae0482cded87 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -8,13 +8,23 @@ #include #include -static bool want_page_poisoning __read_mostly; +static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning); static int __init early_page_poison_param(char *buf) { - if (!buf) - return -EINVAL; - return strtobool(buf, &want_page_poisoning); + int ret; + bool tmp; + + ret = strtobool(buf, &tmp); + if (ret) + return ret; + + if (tmp) + static_branch_enable(&want_page_poisoning); + else + static_branch_disable(&want_page_poisoning); + + return 0; } early_param("page_poison", early_page_poison_param); @@ -31,7 +41,7 @@ bool page_poisoning_enabled(void) * Page poisoning is debug page alloc for some arches. If * either of those options are enabled, enable poisoning. */ - return (want_page_poisoning || + return (static_branch_unlikely(&want_page_poisoning) || (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } -- cgit v1.2.3-59-g8ed1b From 406100762ae9ae5b7709c737ada3089160a0f77a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 15 Oct 2020 20:07:36 -0700 Subject: mm/vmstat.c: use helper macro abs() Use helper macro abs() to simplify the "x > t || x < -t" cmp. Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: https://lkml.kernel.org/r/20200905084008.15748-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- mm/vmstat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4f7b4ee6aa12..698bc0bc18d1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -325,7 +325,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { zone_page_state_add(x, zone, item); x = 0; } @@ -350,7 +350,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { node_page_state_add(x, pgdat, item); x = 0; } @@ -511,7 +511,7 @@ static inline void mod_zone_state(struct zone *zone, o = this_cpu_read(*p); n = delta + o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to zone counters */ @@ -573,7 +573,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, o = this_cpu_read(*p); n = delta + o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to node counters */ -- cgit v1.2.3-59-g8ed1b From 295a17302348bc38704ef0f11ca2ce4ce58db2e9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 15 Oct 2020 20:07:39 -0700 Subject: mm/util.c: update the kerneldoc for kstrdup_const() Memory allocated with kstrdup_const() must not be passed to regular krealloc() as it is not aware of the possibility of the chunk residing in .rodata. Since there are no potential users of krealloc_const() at the moment, let's just update the doc to make it explicit. 
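A short usage sketch of the const-aware pair (a hypothetical caller, not taken from the patch):

  const char *name = kstrdup_const(tmpl, GFP_KERNEL);
  if (!name)
  	return -ENOMEM;
  /* if 'tmpl' lives in .rodata, 'name' is 'tmpl' itself - no copy was made */
  kfree_const(name);	/* frees only if the string was actually duplicated */

Passing such a pointer to krealloc() would ask the allocator to resize a chunk it may never have allocated, hence the stronger wording in the kerneldoc below.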
Signed-off-by: Bartosz Golaszewski Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200817173927.23389-1-brgl@bgdev.pl Signed-off-by: Linus Torvalds --- mm/util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 4e21fe7eae27..4ddb6e186dd5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -69,7 +69,8 @@ EXPORT_SYMBOL(kstrdup); * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Note: Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const and + * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. -- cgit v1.2.3-59-g8ed1b From c9682d10271e1025ebfbb1675c7afffbef5c6856 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:07:43 -0700 Subject: mm/mmu_notifier: fix mmget() assert in __mmu_interval_notifier_insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment talks about having to hold mmget() (which means mm_users), but the actual check is on mm_count (which would be mmgrab()). Given that MMU notifiers are torn down in mmput() -> __mmput() -> exit_mmap() -> mmu_notifier_release(), I believe that the comment is correct and the check should be on mm->mm_users. Fix it up accordingly. Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Reviewed-by: Jason Gunthorpe Cc: John Hubbard Cc: Christoph Hellwig Cc: Christian König --- mm/mmu_notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 4fc918163dd3..5654dd19addc 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -913,7 +913,7 @@ static int __mmu_interval_notifier_insert( return -EOVERFLOW; /* Must call with a mmget() held */ - if (WARN_ON(atomic_read(&mm->mm_count) <= 0)) + if (WARN_ON(atomic_read(&mm->mm_users) <= 0)) return -EINVAL; /* pairs with mmdrop in mmu_interval_notifier_remove() */ -- cgit v1.2.3-59-g8ed1b From 73a11c9658577d876d4c70730719feab3fa328b0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:07:46 -0700 Subject: mm/memory_hotplug: inline __offline_pages() into offline_pages() Patch series "mm/memory_hotplug: online_pages()/offline_pages() cleanups", v2. These are a bunch of cleanups for online_pages()/offline_pages() and related code, mostly getting rid of memory hole handling that is no longer necessary. There is only a single walk_system_ram_range() call left in offline_pages(), to make sure we don't have any memory holes. I had some of these patches lying around for a longer time but didn't have time to polish them. In addition, the last patch marks all pageblocks of memory to get onlined MIGRATE_ISOLATE, so pages that have just been exposed to the buddy cannot get allocated before onlining is complete. Once the heavy lifting is done, the pageblocks are set to MIGRATE_MOVABLE, such that allocations are possible. I played with DIMMs and virtio-mem on x86-64 and didn't spot any surprises. I verified that the number of isolated pageblocks is correctly handled when onlining/offlining. This patch (of 10): There is only a single user, offline_pages(). Let's inline it, to make it look more similar to online_pages().
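With the helper folded in, the two entry points mirror each other (signatures as they appear in the diffs of this series), and end_pfn is derived internally as start_pfn + nr_pages:

  int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid);
  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages);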
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Reviewed-by: Pankaj Gupta Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Michal Hocko Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Cc: Mel Gorman Link: https://lkml.kernel.org/r/20200819175957.28465-1-david@redhat.com Link: https://lkml.kernel.org/r/20200819175957.28465-2-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e9e2d44cdad..ba201ab4c0e4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1484,11 +1484,10 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } -static int __ref __offline_pages(unsigned long start_pfn, - unsigned long end_pfn) +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) { - unsigned long pfn, nr_pages = 0; - unsigned long offlined_pages = 0; + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn, system_ram_pages = 0, offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; struct zone *zone; @@ -1505,9 +1504,9 @@ static int __ref __offline_pages(unsigned long start_pfn, * memory holes PG_reserved, don't need pfn_valid() checks, and can * avoid using walk_system_ram_range() later. */ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages, + walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, count_system_ram_pages_cb); - if (nr_pages != end_pfn - start_pfn) { + if (system_ram_pages != nr_pages) { ret = -EINVAL; reason = "memory holes"; goto failed_removal; @@ -1656,11 +1655,6 @@ failed_removal: return ret; } -int offline_pages(unsigned long start_pfn, unsigned long nr_pages) -{ - return __offline_pages(start_pfn, start_pfn + nr_pages); -} - static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); -- cgit v1.2.3-59-g8ed1b From 4986fac160b3eb1432ca9970d51c55f4f93b1a2e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:07:50 -0700 Subject: mm/memory_hotplug: enforce section granularity when onlining/offlining Already two people (including me) tried to offline subsections, because the function looks like it can deal with it. But we really can only online/offline full sections that are properly aligned (e.g., we can only mark full sections online/offline via SECTION_IS_ONLINE). Add a simple safety net to document the restriction now. Current users (core and powernv/memtrace) respect these restrictions. 
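The new check leans on a power-of-two identity: IS_ALIGNED(a | b, align) holds exactly when both a and b are aligned, so a single test covers both the start pfn and the length. A worked example, assuming x86-64 defaults where PAGES_PER_SECTION is 32768 (128 MiB sections, 4 KiB pages):

  unsigned long pfn = 32768, nr_pages = 16384;	/* half a section */
  IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION);	/* OR is 49152: false -> -EINVAL */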
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-3-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ba201ab4c0e4..03a8825c5620 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -809,6 +809,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int ret; struct memory_notify arg; + /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + if (WARN_ON_ONCE(!nr_pages || + !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + mem_hotplug_begin(); /* associate pfn range with the zone */ @@ -1494,6 +1499,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) struct memory_notify arg; char *reason; + /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + if (WARN_ON_ONCE(!nr_pages || + !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + mem_hotplug_begin(); /* -- cgit v1.2.3-59-g8ed1b From 0a1a9a0008bb7b22a8b6a75264b48bd7f9d885ea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:07:54 -0700 Subject: mm/memory_hotplug: simplify page offlining We make sure that we cannot have any memory holes right at the beginning of offline_pages(). We no longer need walk_system_ram_range() and can call test_pages_isolated() and __offline_isolated_pages() directly. offlined_pages always corresponds to nr_pages, so we can simplify that. [akpm@linux-foundation.org: patch conflict resolution] Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-4-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 44 ++++++++++---------------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 03a8825c5620..c3977ce28061 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1384,28 +1384,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) return ret; } -/* Mark all sections offline and remove all free pages from the buddy. */ -static int -offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, - void *data) -{ - unsigned long *offlined_pages = (unsigned long *)data; - - *offlined_pages += __offline_isolated_pages(start, start + nr_pages); - return 0; -} - -/* - * Check all pages in range, recorded as memory resource, are isolated. 
- */ -static int -check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, - void *data) -{ - return test_pages_isolated(start_pfn, start_pfn + nr_pages, - MEMORY_OFFLINE); -} - static int __init cmdline_parse_movable_node(char *p) { movable_node_enabled = true; @@ -1492,7 +1470,7 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) { const unsigned long end_pfn = start_pfn + nr_pages; - unsigned long pfn, system_ram_pages = 0, offlined_pages = 0; + unsigned long pfn, system_ram_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; struct zone *zone; @@ -1590,9 +1568,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) reason = "failure to dissolve huge pages"; goto failed_removal_isolated; } - /* check again */ - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, - NULL, check_pages_isolated_cb); + /* * per-cpu pages are drained in start_isolate_page_range, but if * there are still pages that are not free, make sure that we @@ -1605,15 +1581,15 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) * because has_unmovable_pages explicitly checks for * PageBuddy on freed pages on other zones. */ + ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); if (ret) drain_all_pages(zone); } while (ret); - /* Ok, all of our target is isolated. - We cannot do rollback at this point. */ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, - &offlined_pages, offline_isolated_pages_cb); - pr_info("Offlined Pages %ld\n", offlined_pages); + /* Mark all sections offline and remove free pages from the buddy. */ + __offline_isolated_pages(start_pfn, end_pfn); + pr_info("Offlined Pages %ld\n", nr_pages); + /* * Onlining will reset pagetype flags and makes migrate type * MOVABLE, so just need to decrease the number of isolated @@ -1624,11 +1600,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) spin_unlock_irqrestore(&zone->lock, flags); /* removal success */ - adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); - zone->present_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + zone->present_pages -= nr_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= offlined_pages; + zone->zone_pgdat->node_present_pages -= nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); -- cgit v1.2.3-59-g8ed1b From 257bea71582d895894201b604990a900df489103 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:07:59 -0700 Subject: mm/page_alloc: simplify __offline_isolated_pages() offline_pages() is the only user. __offline_isolated_pages() never gets called with ranges that contain memory holes and we no longer care about the return value. Drop the return value handling and all pfn_valid() checks. Update the documentation. 
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-5-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- mm/page_alloc.c | 27 ++++----------------------- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c0faa7a30c46..76b314031f09 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -103,8 +103,8 @@ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); -extern unsigned long __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern void __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7fa55d9a3fe4..acc04ca1831d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8704,35 +8704,21 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be in a single zone and isolated - * before calling this. + * All pages in the range must be in a single zone, must not contain holes, + * must span full sections, and must be isolated before calling this function. */ -unsigned long -__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { + unsigned long pfn = start_pfn; struct page *page; struct zone *zone; unsigned int order; - unsigned long pfn; unsigned long flags; - unsigned long offlined_pages = 0; - - /* find the first valid pfn */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) - if (pfn_valid(pfn)) - break; - if (pfn == end_pfn) - return offlined_pages; offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); - pfn = start_pfn; while (pfn < end_pfn) { - if (!pfn_valid(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); /* * The HWPoisoned page may be not in buddy system, and @@ -8740,7 +8726,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) */ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; - offlined_pages++; continue; } /* @@ -8751,20 +8736,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(PageBuddy(page)); pfn++; - offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); - offlined_pages += 1 << order; del_page_from_free_list(page, zone, order); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); - - return offlined_pages; } #endif -- cgit v1.2.3-59-g8ed1b From ea15153c3d46777be28d84c1484ad80a9e119234 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:03 -0700 Subject: mm/memory_hotplug: drop nr_isolate_pageblock in offline_pages() We make sure that we cannot have any memory holes right at the beginning of offline_pages() and we only support to online/offline full sections. 
Both sections and pageblocks are a power of two in size, and sections always span full pageblocks. We can directly calculate the number of isolated pageblocks from nr_pages, and the division is exact (e.g., with x86-64 defaults of 128 MiB sections and 2 MiB pageblocks, each section contributes exactly 64 pageblocks). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-6-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c3977ce28061..90524446aafd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1471,10 +1471,10 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; - int ret, node, nr_isolate_pageblock; unsigned long flags; struct zone *zone; struct memory_notify arg; + int ret, node; char *reason; /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ @@ -1518,7 +1518,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) reason = "failure to isolate range"; goto failed_removal; } - nr_isolate_pageblock = ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1596,7 +1595,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) * pageblocks zone counter here. */ spin_lock_irqsave(&zone->lock, flags); - zone->nr_isolate_pageblock -= nr_isolate_pageblock; + zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); /* removal success */ -- cgit v1.2.3-59-g8ed1b From 3fa0c7c79d2499af390fbfaf60c94753e7f630f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:07 -0700 Subject: mm/page_isolation: simplify return value of start_isolate_page_range() Callers no longer need the number of isolated pageblocks. Let's simplify.
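In sketch form, callers move from range-checking a count to the usual 0/-errno convention (compare the offline_pages() and alloc_contig_range() hunks below):

  ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
  				 MEMORY_OFFLINE | REPORT_FAILURE);
  if (ret)	/* previously: if (ret < 0), since >= 0 was a pageblock count */
  	goto failed_removal;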
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-7-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 2 +- mm/page_isolation.c | 7 ++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 90524446aafd..d61e5be6d965 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1514,7 +1514,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, MEMORY_OFFLINE | REPORT_FAILURE); - if (ret < 0) { + if (ret) { reason = "failure to isolate range"; goto failed_removal; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index acc04ca1831d..19807b688171 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8468,7 +8468,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype, 0); - if (ret < 0) + if (ret) return ret; /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index aa94afb63823..abfe26ad59fd 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -173,8 +173,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * (e.g. __offline_pages will need to call it after check for isolated range for * a next retry). * - * Return: the number of isolated pageblocks on success and -EBUSY if any part - * of range cannot be isolated. + * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, int flags) @@ -182,7 +181,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; unsigned long undo_pfn; struct page *page; - int nr_isolate_pageblock = 0; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); @@ -196,10 +194,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo_pfn = pfn; goto undo; } - nr_isolate_pageblock++; } } - return nr_isolate_pageblock; + return 0; undo: for (pfn = start_pfn; pfn < undo_pfn; -- cgit v1.2.3-59-g8ed1b From aac65321ba69425badc33f84f5307af7b21ef963 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:11 -0700 Subject: mm/memory_hotplug: simplify page onlining We don't allow offlining memory with holes, all boot memory is online, and all hotplugged memory cannot have holes. We can now simplify the onlining of pages. As we only allow onlining/offlining full sections, and sections always span full MAX_ORDER_NR_PAGES, we can simply process the range in MAX_ORDER - 1 chunks (i.e., MAX_ORDER_NR_PAGES pages at a time) without further special handling. The number of onlined pages simply corresponds to the number of pages we were requested to online. While at it, refine the comment regarding the callback not exposing all pages to the buddy.
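The arithmetic behind the fixed-stride loop, assuming x86-64 defaults: MAX_ORDER is 11, so MAX_ORDER_NR_PAGES is 1024 pages (4 MiB), and a 32768-page (128 MiB) section is exactly 32 such chunks. The resulting loop in the diff below reduces to:

  for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
  	(*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);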
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-8-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d61e5be6d965..113edf95b908 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -625,31 +625,22 @@ void generic_online_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(generic_online_page); -static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg) +static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; - int order; /* - * Online the pages. The callback might decide to keep some pages - * PG_reserved (to add them to the buddy later), but we still account - * them as being online/belonging to this zone ("present"). + * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might + * decide to not expose all pages to the buddy (e.g., expose them + * later). We account all pages as being online and belonging to this + * zone ("present"). */ - for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { - order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); - /* __free_pages_core() wants pfns to be aligned to the order */ - if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) - order = 0; - (*online_page_callback)(pfn_to_page(pfn), order); - } + for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) + (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); - - *(unsigned long *)arg += nr_pages; - return 0; } /* check which state of node_states will be changed when online memory */ @@ -803,7 +794,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid) { unsigned long flags; - unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; int ret; @@ -839,19 +829,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, setup_zone_pageset(zone); } - ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, - online_pages_range); - if (ret) { - /* not a single memory resource was applicable */ - if (need_zonelists_rebuild) - zone_pcp_reset(zone); - goto failed_addition; - } - - zone->present_pages += onlined_pages; + online_pages_range(pfn, nr_pages); + zone->present_pages += nr_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += onlined_pages; + zone->zone_pgdat->node_present_pages += nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); /* -- cgit v1.2.3-59-g8ed1b From 4eb29bd9d089977cbf592e53cd074e3c5092fd15 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:15 -0700 Subject: mm/page_alloc: drop stale pageblock comment in memmap_init_zone*() Commit ac5d2539b238 ("mm: meminit: reduce number of times pageblocks are set during struct page init") moved the actual zone range check, leaving only the alignment check for pageblocks. 
Let's drop the stale comment and make the pageblock check easier to read. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Mel Gorman Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-9-david@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19807b688171..7a99ed299443 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6042,13 +6042,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived * kernel allocations are made. - * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. */ - if (!(pfn & (pageblock_nr_pages - 1))) { + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } @@ -6111,15 +6106,10 @@ void __ref memmap_init_zone_device(struct zone *zone, * the address space during boot when many long-lived * kernel allocations are made. * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. - * * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (!(pfn & (pageblock_nr_pages - 1))) { + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } -- cgit v1.2.3-59-g8ed1b From d882c0067d99d0f2add9a41628703cc99511a639 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:19 -0700 Subject: mm: pass migratetype into memmap_init_zone() and move_pfn_range_to_zone() On the memory onlining path, we want to start with MIGRATE_ISOLATE, to un-isolate the pages after memory onlining is complete. Let's allow passing in the migratetype. 
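The intended split between callers, in sketch form (the MIGRATE_ISOLATE user only arrives with a later patch in this series):

  /* boot and ZONE_DEVICE paths: pageblocks are usable right away */
  memmap_init_zone(size, nid, zone, start_pfn, MEMINIT_EARLY,
  		 NULL, MIGRATE_MOVABLE);

  /* memory onlining: keep the range unallocatable until onlining completes */
  move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);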
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Tony Luck Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: Dan Williams Cc: Mike Rapoport Cc: "Matthew Wilcox (Oracle)" Cc: Michel Lespinasse Cc: Charan Teja Reddy Cc: Mel Gorman Link: https://lkml.kernel.org/r/20200819175957.28465-10-david@redhat.com Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- include/linux/memory_hotplug.h | 3 ++- include/linux/mm.h | 2 +- mm/memory_hotplug.c | 11 ++++++++--- mm/memremap.c | 3 ++- mm/page_alloc.c | 21 ++++++++++++--------- 6 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d8686bf3ae2f..ef12e097f318 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -537,7 +537,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } @@ -547,7 +547,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, { if (!vmem_map) { memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 76b314031f09..51a877fec8da 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,7 +351,8 @@ extern int add_memory_resource(int nid, struct resource *resource); extern int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype); extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index a9df46309e07..61a2633fcc7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2440,7 +2440,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 113edf95b908..bb30e99b7383 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -701,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this * call, all affected pages are PG_reserved. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. 
*/ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -728,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMINIT_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); } @@ -808,7 +813,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE); arg.start_pfn = pfn; arg.nr_pages = nr_pages; diff --git a/mm/memremap.c b/mm/memremap.c index 198083453182..73a206d0f645 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -266,7 +266,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), params->altmap); + PHYS_PFN(range_len(range)), params->altmap, + MIGRATE_MOVABLE); } mem_hotplug_done(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a99ed299443..f7f292f1d108 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5990,10 +5990,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum meminit_context context, - struct vmem_altmap *altmap) + unsigned long start_pfn, + enum meminit_context context, + struct vmem_altmap *altmap, int migratetype) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -6037,14 +6042,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, __SetPageReserved(page); /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, + * such that unmovable allocations won't be scattered all + * over the place during system boot. 
*/ if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); cond_resched(); } pfn++; @@ -6144,7 +6147,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } } -- cgit v1.2.3-59-g8ed1b From b30c59279d6891bbd583aed2e1f0cd2a73ab475a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:23 -0700 Subject: mm/memory_hotplug: mark pageblocks MIGRATE_ISOLATE while onlining memory Currently, it can happen that pages are allocated (and freed) via the buddy before we finished basic memory onlining. For example, pages are exposed to the buddy and can be allocated before we actually mark the sections online. Allocated pages could suddenly fail pfn_to_online_page() checks. We had similar issues with pcp handling, when pages are allocated+freed before we reach zone_pcp_update() in online_pages() [1]. Instead, mark all pageblocks MIGRATE_ISOLATE, such that allocations are impossible. Once done with the heavy lifting, use undo_isolate_page_range() to move the pages to the MIGRATE_MOVABLE freelist, marking them ready for allocation. Similar to offline_pages(), we have to manually adjust zone->nr_isolate_pageblock. [1] https://lkml.kernel.org/r/1597150703-19003-1-git-send-email-charante@codeaurora.org Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-11-david@redhat.com Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- mm/memory_hotplug.c | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index e72e61c1d62e..c7f30f8b282b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -152,6 +152,7 @@ config HAVE_BOOTMEM_INFO_NODE # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" + select MEMORY_ISOLATION depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on 64BIT || BROKEN @@ -178,7 +179,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE config MEMORY_HOTREMOVE bool "Allow for memory hot remove" - select MEMORY_ISOLATION select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bb30e99b7383..1b40eebae3e4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -813,7 +813,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -824,6 +824,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, if (ret) goto failed_addition; + /* + * Fixup the number of isolated pageblocks before marking the sections + * onlining, such that undo_isolate_page_range() works correctly. 
+ */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; + spin_unlock_irqrestore(&zone->lock, flags); + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. @@ -841,21 +849,25 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, zone->zone_pgdat->node_present_pages += nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); + node_states_set_node(nid, &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL); + zone_pcp_update(zone); + + /* Basic onlining is complete, allow allocation of onlined pages. */ + undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + /* * When exposing larger, physically contiguous memory areas to the * buddy, shuffling in the buddy (when freeing onlined pages, putting * them either to the head or the tail of the freelist) is only helpful * for maintaining the shuffle, but not for creating the initial * shuffle. Shuffle the whole zone to make sure the just onlined pages - * are properly distributed across the whole freelist. + * are properly distributed across the whole freelist. Make sure to + * shuffle once pageblocks are no longer isolated. */ shuffle_zone(zone); - node_states_set_node(nid, &arg); - if (need_zonelists_rebuild) - build_all_zonelists(NULL); - zone_pcp_update(zone); - init_per_zone_wmark_min(); kswapd_run(nid); @@ -1577,9 +1589,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) pr_info("Offlined Pages %ld\n", nr_pages); /* - * Onlining will reset pagetype flags and makes migrate type - * MOVABLE, so just need to decrease the number of isolated - * pageblocks zone counter here. + * The memory sections are marked offline, and the pageblock flags + * effectively stale; nobody should be touching them. Fixup the number + * of isolated pageblocks, memory onlining will properly revert this. */ spin_lock_irqsave(&zone->lock, flags); zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; -- cgit v1.2.3-59-g8ed1b From ec62d04e3fdc4ba3a7912cd7f6da1a4e787a0d75 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:28 -0700 Subject: kernel/resource: make release_mem_region_adjustable() never fail Patch series "selective merging of system ram resources", v4. Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. Resources are effectively stored in a list-based tree. Having a lot of resources not only wastes memory, it also makes traversing that tree more expensive, and makes /proc/iomem explode in size (e.g., requiring kexec-tools to manually merge resources when creating a kdump header. The current kexec-tools resource count limit does not allow for more than ~100GB of memory with a memory block size of 128MB on x86-64). Let's allow to selectively merge system ram resources by specifying a new flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only tested with virtio-mem. This patch (of 8): Let's make sure splitting a resource on memory hotunplug will never fail. This will become more relevant once we merge selected System RAM resources - then, we'll trigger that case more often on memory hotunplug. 
In general, this function is already unlikely to fail. When we remove memory, we free up quite a lot of metadata (memmap, page tables, memory block device, etc.). The only reason it could really fail would be when injecting allocation errors. All other error cases inside release_mem_region_adjustable() seem to be sanity checks if the function would be abused in different context - let's add WARN_ON_ONCE() in these cases so we can catch them. [natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable] Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/1159 Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monn Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++-- kernel/resource.c | 49 ++++++++++++++++++++++++++++--------------------- mm/memory_hotplug.c | 22 +--------------------- 3 files changed, 31 insertions(+), 44 deletions(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6c2b06fe8beb..52a91f5fa1a3 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -248,8 +248,8 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); #endif /* Wrappers for managed devices */ diff --git a/kernel/resource.c b/kernel/resource.c index f1175ce93a1d..4c1c487abbfb 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1258,21 +1258,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) { + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memap), + * just before releasing the region. 
This is highly unlikely to + * fail - let's play save and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1298,7 +1305,6 @@ int release_mem_region_adjustable(struct resource *parent, * so if we are dealing with them, let us just back off here. */ if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; break; } @@ -1315,20 +1321,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1339,9 +1348,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1352,7 +1360,6 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b40eebae3e4..5d9c41ec0556 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1727,26 +1727,6 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); -static void __release_memory_resource(resource_size_t start, - resource_size_t size) -{ - int ret; - - /* - * When removing memory in the same granularity as it was added, - * this function never fails. It might only fail if resources - * have to be adjusted or split. We'll ignore the error, as - * removing of memory cannot fail. - */ - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } -} - static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; @@ -1780,7 +1760,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size); } - __release_memory_resource(start, size); + release_mem_region_adjustable(&iomem_resource, start, size); try_offline_node(nid); -- cgit v1.2.3-59-g8ed1b From 7cf603d17d9bddbda90c424b6f30c7bc2e6f48f2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:33 -0700 Subject: kernel/resource: move and rename IORESOURCE_MEM_DRIVER_MANAGED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IORESOURCE_MEM_DRIVER_MANAGED currently uses an unused PnP bit, which is always set to 0 by hardware. 
This is far from beautiful (and confusing), and the bit only applies to SYSRAM. So let's move it out of the bus-specific (PnP) defined bits. We'll add another SYSRAM specific bit soon. If we ever need more bits for other purposes, we can steal some from "desc", or reshuffle/regroup what we have. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Eric Biederman Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 +++- kernel/kexec_file.c | 2 +- mm/memory_hotplug.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 52a91f5fa1a3..d7620d7c941a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -58,6 +58,9 @@ struct resource { #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ +/* IORESOURCE_SYSRAM specific bits. */ +#define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 @@ -103,7 +106,6 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) -#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 84f7316792a7..e21f6b9234f7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -521,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) /* Returning 0 will take to next memory range */ /* Don't use memory that will be detected and handled by a driver. */ - if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) return 0; if (sz < kbuf->memsz) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5d9c41ec0556..9a7193970e77 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 start, u64 size, unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (strcmp(resource_name, "System RAM")) - flags |= IORESOURCE_MEM_DRIVER_MANAGED; + flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -1161,7 +1161,7 @@ EXPORT_SYMBOL_GPL(add_memory); * * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided * memory map") are created. 
Also, the created memory resource is flagged - * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case * this memory as well (esp., not place kexec images onto it). * * The resource_name (visible via /proc/iomem) has to have the format -- cgit v1.2.3-59-g8ed1b From 3a0aaefe4134951b4e89feb873c457428154530c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:39 -0700 Subject: mm/memory_hotplug: guard more declarations by CONFIG_MEMORY_HOTPLUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We soon want to pass flags via a new type to add_memory() and friends. That revealed that we currently don't guard some declarations by CONFIG_MEMORY_HOTPLUG. While some definitions could be moved to different places, let's keep it minimal for now and use CONFIG_MEMORY_HOTPLUG for all functions only compiled with CONFIG_MEMORY_HOTPLUG. Wrap sparse_decode_mem_map() into CONFIG_MEMORY_HOTPLUG, it's only called from CONFIG_MEMORY_HOTPLUG code. While at it, remove allow_online_pfn_range(), which is no longer around, and mhp_notimplemented(), which is unused. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: Kees Cook Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. 
Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-4-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 12 +++--------- mm/sparse.c | 2 ++ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51a877fec8da..1504b4d5ae6c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,13 +247,6 @@ static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} -static inline int mhp_notimplemented(const char *func) -{ - printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); - dump_stack(); - return -ENOSYS; -} - static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } @@ -344,6 +337,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {} extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); +#ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); @@ -364,8 +358,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, - int online_type); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); +#endif /* CONFIG_MEMORY_HOTPLUG */ + #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/sparse.c b/mm/sparse.c index b25ad8e64839..7bd23f9d6cef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -312,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p return coded_mem_map; } +#ifdef CONFIG_MEMORY_HOTPLUG /* * Decode mem_map from the coded memmap */ @@ -321,6 +322,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn coded_mem_map &= SECTION_MAP_MASK; return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } +#endif /* CONFIG_MEMORY_HOTPLUG */ static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, -- cgit v1.2.3-59-g8ed1b From b6117199787c60539105d2de0d010146e8396fc3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:44 -0700 Subject: mm/memory_hotplug: prepare passing flags to add_memory() and friends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We soon want to pass flags, e.g., to mark added System RAM resources. mergeable. Prepare for that. This patch is based on a similar patch by Oscar Salvador: https://lkml.kernel.org/r/20190625075227.15193-3-osalvador@suse.de Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Juergen Gross # Xen related part Reviewed-by: Pankaj Gupta Acked-by: Wei Liu Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Baoquan He Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: Vishal Verma Cc: Dave Jiang Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: David Hildenbrand Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Boris Ostrovsky Cc: Stefano Stabellini Cc: "Oliver O'Halloran" Cc: Pingfan Liu Cc: Nathan Lynch Cc: Libor Pechacek Cc: Anton Blanchard Cc: Leonardo Bras Cc: Ard Biesheuvel Cc: Eric Biederman Cc: Julien Grall Cc: Kees Cook Cc: Roger Pau Monné Cc: Thomas Gleixner Cc: Wei Yang Link: https://lkml.kernel.org/r/20200911103459.10306-5-david@redhat.com Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/powernv/memtrace.c | 2 +- arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +- drivers/acpi/acpi_memhotplug.c | 3 ++- drivers/base/memory.c | 3 ++- drivers/dax/kmem.c | 2 +- drivers/hv/hv_balloon.c | 2 +- drivers/s390/char/sclp_cmd.c | 2 +- drivers/virtio/virtio_mem.c | 2 +- drivers/xen/balloon.c | 2 +- include/linux/memory_hotplug.h | 16 ++++++++++++---- mm/memory_hotplug.c | 14 +++++++------- 11 files changed, 30 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 13b369d2cc45..6828108486f8 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -224,7 +224,7 @@ static int memtrace_online(void) ent->mem = 0; } - if (add_memory(ent->nid, ent->start, ent->size)) { + if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) { pr_err("Failed to add trace memory to node %d\n", ent->nid); ret += 1; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 5d545b78111f..d8bbf0cc1601 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -606,7 +606,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) block_sz = memory_block_size_bytes(); /* Add the memory */ - rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); + rc = __add_memory(lmb->nid, lmb->base_addr, block_sz, MHP_NONE); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index ad6e90fbc813..b02fd51e5589 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -194,7 +194,8 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); - result = __add_memory(node, info->start_addr, info->length); + result = __add_memory(node, info->start_addr, info->length, + MHP_NONE); /* * If the memory block has been used by the kernel, add_memory() diff --git a/drivers/base/memory.c b/drivers/base/memory.c index adf828dfccf0..eef4ffb6122c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -432,7 +432,8 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, nid = memory_add_physaddr_to_nid(phys_addr); ret = __add_memory(nid, phys_addr, - MIN_MEMORY_BLOCK_SIZE * sections_per_block); + MIN_MEMORY_BLOCK_SIZE * sections_per_block, + MHP_NONE); if (ret) goto out; diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index af04b6d1d263..b4368c5b6a0c 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -109,7 +109,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) * this as RAM automatically. 
*/ rc = add_memory_driver_managed(numa_node, range.start, - range_len(&range), kmem_name); + range_len(&range), kmem_name, MHP_NONE); if (rc) { dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 32e3bc0aa665..3c0d52e24452 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT)); + (HA_CHUNK << PAGE_SHIFT), MHP_NONE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index a864b21af602..f6e97f0830f6 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -406,7 +406,7 @@ static void __init add_memory_merged(u16 rn) if (!size) goto skip_add; for (addr = start; addr < start + size; addr += block_size) - add_memory(0, addr, block_size); + add_memory(0, addr, block_size, MHP_NONE); skip_add: first_rn = rn; num = 1; diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 834b7c13ef3d..ed99e4335401 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -424,7 +424,7 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), - vm->resource_name); + vm->resource_name, MHP_NONE); } /* diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 51427c752b37..9f40a294d398 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource); + rc = add_memory_resource(nid, resource, MHP_NONE); unlock_device_hotplug(); mutex_lock(&balloon_mutex); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1504b4d5ae6c..33eb80fdba22 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -57,6 +57,12 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* Flags for add_memory() and friends to specify memory hotplug details. 
*/ +typedef int __bitwise mhp_t; + +/* No special request */ +#define MHP_NONE ((__force mhp_t)0) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -339,11 +345,13 @@ extern void clear_zone_contiguous(struct zone *zone); #ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); -extern int __add_memory(int nid, u64 start, u64 size); -extern int add_memory(int nid, u64 start, u64 size); -extern int add_memory_resource(int nid, struct resource *resource); +extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory_resource(int nid, struct resource *resource, + mhp_t mhp_flags); extern int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name); + const char *resource_name, + mhp_t mhp_flags); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9a7193970e77..dba71bc34daa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1039,7 +1039,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = PAGE_KERNEL }; u64 start, size; @@ -1119,7 +1119,7 @@ error: } /* requires device_hotplug_lock, see add_memory_resource() */ -int __ref __add_memory(int nid, u64 start, u64 size) +int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; @@ -1128,18 +1128,18 @@ int __ref __add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, mhp_flags); if (ret < 0) release_memory_resource(res); return ret; } -int add_memory(int nid, u64 start, u64 size) +int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { int rc; lock_device_hotplug(); - rc = __add_memory(nid, start, size); + rc = __add_memory(nid, start, size, mhp_flags); unlock_device_hotplug(); return rc; @@ -1168,7 +1168,7 @@ EXPORT_SYMBOL_GPL(add_memory); * "System RAM ($DRIVER)". */ int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name) + const char *resource_name, mhp_t mhp_flags) { struct resource *res; int rc; @@ -1186,7 +1186,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size, goto out_unlock; } - rc = add_memory_resource(nid, res); + rc = add_memory_resource(nid, res, mhp_flags); if (rc < 0) release_memory_resource(res); -- cgit v1.2.3-59-g8ed1b From 9ca6551ee24368a4d2b09566ea4d10fe87860379 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:49 -0700 Subject: mm/memory_hotplug: MEMHP_MERGE_RESOURCE to specify merging of System RAM resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). 
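The mhp_t flag word introduced above follows the kernel's sparse-checked "__bitwise" pattern. A minimal stand-alone model of that pattern (the sparse annotations are defined away outside a __CHECKER__ build, and the helper is illustrative, not a kernel function):

#include <stdbool.h>

#ifndef __CHECKER__
#define __bitwise	/* sparse-only annotation, empty for the compiler */
#define __force
#endif

typedef int __bitwise mhp_t;

#define MHP_NONE		((__force mhp_t)0)
/* BIT(0) in the kernel header; spelled out here for a stand-alone build */
#define MEMHP_MERGE_RESOURCE	((__force mhp_t)(1 << 0))

static bool wants_merge(mhp_t mhp_flags)
{
	return mhp_flags & MEMHP_MERGE_RESOURCE;
}

The payoff of the typedef is that every future flag can be added without touching the add_memory*() prototypes again, and sparse flags any caller that passes a plain integer by mistake.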
We really want to merge added resources in this scenario where possible. Let's provide a flag (MEMHP_MERGE_RESOURCE) to specify that a resource either created within add_memory*() or passed via add_memory_resource() shall be marked mergeable and merged with applicable siblings. To implement that, we need a kernel/resource interface to mark selected System RAM resources mergeable (IORESOURCE_SYSRAM_MERGEABLE) and trigger merging. Note: We really want to merge after the whole operation succeeded, not directly when adding a resource to the resource tree (it would break add_memory_resource() and require splitting resources again when the operation failed - e.g., due to -ENOMEM). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Thomas Gleixner Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Roger Pau Monné Cc: Julien Grall Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Wang Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Vasily Gorbik Cc: Vishal Verma Link: https://lkml.kernel.org/r/20200911103459.10306-6-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 +++ include/linux/memory_hotplug.h | 7 +++++ kernel/resource.c | 60 ++++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 7 +++++ 4 files changed, 78 insertions(+) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index d7620d7c941a..7e61389dcb01 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -60,6 +60,7 @@ struct resource { /* IORESOURCE_SYSRAM specific bits. */ #define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ +#define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ @@ -253,6 +254,9 @@ extern void __release_region(struct resource *, resource_size_t, extern void release_mem_region_adjustable(struct resource *, resource_size_t, resource_size_t); #endif +#ifdef CONFIG_MEMORY_HOTPLUG +extern void merge_system_ram_resource(struct resource *res); +#endif /* Wrappers for managed devices */ struct device; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 33eb80fdba22..d65c6fdc5cfc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -62,6 +62,13 @@ typedef int __bitwise mhp_t; /* No special request */ #define MHP_NONE ((__force mhp_t)0) +/* + * Allow merging of the added System RAM resource with adjacent, + * mergeable resources. After a successful call to add_memory_resource() + * with this flag set, the resource pointer must no longer be used as it + * might be stale, or the resource might have changed. 
+ */ +#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) /* * Extended parameters for memory hotplug: diff --git a/kernel/resource.c b/kernel/resource.c index 4c1c487abbfb..92026827d95b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1363,6 +1363,66 @@ retry: } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/* + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. */ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dba71bc34daa..dc7d82ece6cb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1103,6 +1103,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); + /* + * In case we're allowed to merge the resource, flag it and trigger + * merging now that adding succeeded. + */ + if (mhp_flags & MEMHP_MERGE_RESOURCE) + merge_system_ram_resource(res); + /* online pages if requested */ if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); -- cgit v1.2.3-59-g8ed1b From 9b24247a24471e1333fb556a12d0e3be30d2a750 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:56 -0700 Subject: virtio-mem: try to merge system ram resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virtio-mem adds memory in memory block granularity, to be able to remove it in the same granularity again later, and to grow slowly on demand. 
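The merge itself is a plain walk of the address-ordered sibling list. A minimal user-space model of the logic in merge_system_ram_resource() above, with the locking, the name/desc comparison, and the child check elided; names are illustrative, not kernel APIs:

#include <stdbool.h>
#include <stdlib.h>

struct res {
	unsigned long long start, end;	/* inclusive range */
	unsigned long flags;
	struct res *sibling;		/* next entry in address order */
};

/* the kernel version also compares name/desc and rejects children */
static bool adjacent_mergeable(const struct res *r1, const struct res *r2)
{
	return r1->flags == r2->flags && r1->end + 1 == r2->start;
}

static void merge_adjacent(struct res **head, struct res *res)
{
	struct res *cur;

	/* try to fold the next entry into @res */
	cur = res->sibling;
	if (cur && adjacent_mergeable(res, cur)) {
		res->end = cur->end;
		res->sibling = cur->sibling;
		free(cur);
	}

	/* walk from the head to find the predecessor of @res */
	for (cur = *head; cur && cur->sibling != res; cur = cur->sibling)
		;
	if (cur && adjacent_mergeable(cur, res)) {
		cur->end = res->end;
		cur->sibling = res->sibling;
		free(res);	/* @res is stale from here on */
	}
}

As in the kernel version, the passed-in pointer may be freed by the merge, which is exactly why callers of add_memory_resource() must not reuse the resource pointer after requesting MEMHP_MERGE_RESOURCE.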
This, however, results in quite a lot of resources when adding a lot of memory. Resources are effectively stored in a list-based tree. Having a lot of resources not only wastes memory, it also makes traversing that tree more expensive, and makes /proc/iomem explode in size (e.g., requiring kexec-tools to manually merge resources later when e.g., trying to create a kdump header). Before this patch, we get (/proc/iomem) when hotplugging 2G via virtio-mem on x86-64: [...] 100000000-13fffffff : System RAM 140000000-33fffffff : virtio0 140000000-147ffffff : System RAM (virtio_mem) 148000000-14fffffff : System RAM (virtio_mem) 150000000-157ffffff : System RAM (virtio_mem) 158000000-15fffffff : System RAM (virtio_mem) 160000000-167ffffff : System RAM (virtio_mem) 168000000-16fffffff : System RAM (virtio_mem) 170000000-177ffffff : System RAM (virtio_mem) 178000000-17fffffff : System RAM (virtio_mem) 180000000-187ffffff : System RAM (virtio_mem) 188000000-18fffffff : System RAM (virtio_mem) 190000000-197ffffff : System RAM (virtio_mem) 198000000-19fffffff : System RAM (virtio_mem) 1a0000000-1a7ffffff : System RAM (virtio_mem) 1a8000000-1afffffff : System RAM (virtio_mem) 1b0000000-1b7ffffff : System RAM (virtio_mem) 1b8000000-1bfffffff : System RAM (virtio_mem) 3280000000-32ffffffff : PCI Bus 0000:00 With this patch, we get (/proc/iomem): [...] fffc0000-ffffffff : Reserved 100000000-13fffffff : System RAM 140000000-33fffffff : virtio0 140000000-1bfffffff : System RAM (virtio_mem) 3280000000-32ffffffff : PCI Bus 0000:00 Of course, with more hotplugged memory, it gets worse. When unplugging memory blocks again, try_remove_memory() (via offline_and_remove_memory()) will properly split the resource up again. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Cc: Michal Hocko Cc: Dan Williams Cc: Michael S. Tsirkin Cc: Jason Wang Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Juergen Gross Cc: Julien Grall Cc: Kees Cook Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. 
Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-7-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_mem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index ed99e4335401..ba4de598f663 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -424,7 +424,8 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), - vm->resource_name, MHP_NONE); + vm->resource_name, + MEMHP_MERGE_RESOURCE); } /* -- cgit v1.2.3-59-g8ed1b From 1b989d5d72abb237ae1ee8c4ce3e15cd5a364a68 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:01 -0700 Subject: xen/balloon: try to merge system ram resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's try to merge system ram resources we add, to minimize the number of resources in /proc/iomem. We don't care about the boundaries of individual chunks we added. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Juergen Gross Cc: Michal Hocko Cc: Boris Ostrovsky Cc: Stefano Stabellini Cc: Roger Pau Monné Cc: Julien Grall Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Jason Wang Cc: Kees Cook Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-8-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/xen/balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 9f40a294d398..b57b2067ecbf 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource, MHP_NONE); + rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE); unlock_device_hotplug(); mutex_lock(&balloon_mutex); -- cgit v1.2.3-59-g8ed1b From 2c76e7f6c42bcee0e25194b54140cbce866d191a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:07 -0700 Subject: hv_balloon: try to merge system ram resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's try to merge system ram resources we add, to minimize the number of resources in /proc/iomem. We don't care about the boundaries of individual chunks we added. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Liu Cc: Michal Hocko Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: Kees Cook Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Link: https://lkml.kernel.org/r/20200911103459.10306-9-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 3c0d52e24452..b64d2efbefe7 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MHP_NONE); + (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); -- cgit v1.2.3-59-g8ed1b From cb8e3c8b4f45e4ed8987a581956dc9c3827a5bcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:12 -0700 Subject: kernel/resource: make iomem_resource implicit in release_mem_region_adjustable() "mem" in the name already indicates the root, similar to release_mem_region() and devm_request_mem_region(). Make it implicit. The only single caller always passes iomem_resource, other parents are not applicable. Suggested-by: Wei Yang Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Link: https://lkml.kernel.org/r/20200916073041.10355-1-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 3 +-- kernel/resource.c | 5 ++--- mm/memory_hotplug.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 7e61389dcb01..5135d4b86cd6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -251,8 +251,7 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern void release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(resource_size_t, resource_size_t); #endif #ifdef CONFIG_MEMORY_HOTPLUG extern void merge_system_ram_resource(struct resource *res); diff --git a/kernel/resource.c b/kernel/resource.c index 92026827d95b..3ae2f56cc79d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1240,7 +1240,6 @@ EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1258,9 +1257,9 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. 
*/ -void release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; struct resource *new_res = NULL; bool alloc_nofail = false; struct resource **p; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dc7d82ece6cb..1efc92f66b9d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1767,7 +1767,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size); } - release_mem_region_adjustable(&iomem_resource, start, size); + release_mem_region_adjustable(start, size); try_offline_node(nid); -- cgit v1.2.3-59-g8ed1b From 90c7eaeb14a325a760d732184ff1fbed47e5fa98 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 15 Oct 2020 20:09:15 -0700 Subject: mm: don't panic when links can't be created in sysfs At boot time, or when doing memory hot-add operations, if the links in sysfs can't be created, the system is still able to run, so just report the error in the kernel log rather than BUG_ON and potentially make system unusable because the callpath can be called with locks held. Since the number of memory blocks managed could be high, the messages are rate limited. As a consequence, link_mem_sections() has no status to report anymore. Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Fenghua Yu Cc: Nathan Lynch Cc: "Rafael J . Wysocki" Cc: Scott Cheloha Cc: Tony Luck Link: https://lkml.kernel.org/r/20200915094143.79181-4-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- drivers/base/node.c | 33 +++++++++++++++++++++------------ include/linux/node.h | 16 +++++++--------- mm/memory_hotplug.c | 5 ++--- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 43d21f9e88b1..6ffa470e2984 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -772,8 +772,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn) return pfn_to_nid(pfn); } -static int do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk) +static void do_register_memory_block_under_node(int nid, + struct memory_block *mem_blk) { int ret; @@ -786,12 +786,19 @@ static int do_register_memory_block_under_node(int nid, ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); - if (ret) - return ret; + if (ret && ret != -EEXIST) + dev_err_ratelimited(&node_devices[nid]->dev, + "can't create link to %s in sysfs (%d)\n", + kobject_name(&mem_blk->dev.kobj), ret); - return sysfs_create_link_nowarn(&mem_blk->dev.kobj, + ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj, &node_devices[nid]->dev.kobj, kobject_name(&node_devices[nid]->dev.kobj)); + if (ret && ret != -EEXIST) + dev_err_ratelimited(&mem_blk->dev, + "can't create link to %s in sysfs (%d)\n", + kobject_name(&node_devices[nid]->dev.kobj), + ret); } /* register memory section under specified node if it spans that node */ @@ -827,7 +834,8 @@ static int register_mem_block_under_node_early(struct memory_block *mem_blk, if (page_nid != nid) continue; - return do_register_memory_block_under_node(nid, mem_blk); + do_register_memory_block_under_node(nid, mem_blk); + return 0; } /* mem section does not span the specified node */ return 0; @@ -842,7 +850,8 @@ static int register_mem_block_under_node_hotplug(struct 
memory_block *mem_blk, { int nid = *(int *)arg; - return do_register_memory_block_under_node(nid, mem_blk); + do_register_memory_block_under_node(nid, mem_blk); + return 0; } /* @@ -860,8 +869,8 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } -int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, - enum meminit_context context) +void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, + enum meminit_context context) { walk_memory_blocks_func_t func; @@ -870,9 +879,9 @@ int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, else func = register_mem_block_under_node_early; - return walk_memory_blocks(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), (void *)&nid, - func); + walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), + (void *)&nid, func); + return; } #ifdef CONFIG_HUGETLBFS diff --git a/include/linux/node.h b/include/linux/node.h index 014ba3ab2efd..8e5a29897936 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,15 +99,14 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context); +void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else -static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context) +static inline void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { - return 0; } #endif @@ -130,8 +129,7 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn, - MEMINIT_EARLY); + link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY); } return error; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1efc92f66b9d..d397af38f9ce 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1092,9 +1092,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); - BUG_ON(ret); + link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) -- cgit v1.2.3-59-g8ed1b From f04a5d5d913fa81b5a3456512c3991aea7227912 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:20 -0700 Subject: mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag Patch series "mm: place pages to the freelist tail when onlining and undoing isolation", v2. When adding separate memory blocks via add_memory*() and onlining them immediately, the metadata (especially the memmap) of the next block will be placed onto one of the just added+onlined block. This creates a chain of unmovable allocations: If the last memory block cannot get offlined+removed() so will all dependent ones. We directly have unmovable allocations all over the place. This can be observed quite easily using virtio-mem, however, it can also be observed when using DIMMs. 
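The head-versus-tail effect behind this chain is easy to model in miniature. A toy user-space freelist, assuming allocations always take the head (as buddy allocations effectively do); everything here is illustrative:

#include <stdio.h>
#include <string.h>

#define MAXP 16
static int freelist[MAXP];	/* freelist[0] is the head */
static int nfree;

static void free_to_head(int pfn)
{
	memmove(&freelist[1], &freelist[0], nfree * sizeof(int));
	freelist[0] = pfn;
	nfree++;
}

static void free_to_tail(int pfn)
{
	freelist[nfree++] = pfn;
}

static int alloc_page(void)	/* allocations always take the head */
{
	int pfn = freelist[0];

	nfree--;
	memmove(&freelist[0], &freelist[1], nfree * sizeof(int));
	return pfn;
}

int main(void)
{
	/* boot memory: pfns 0..3 on the freelist */
	for (int pfn = 0; pfn < 4; pfn++)
		free_to_tail(pfn);

	/* hotplugged pfn 100 freed to the head: the very next allocation
	 * (e.g., the next block's memmap) lands on the new memory */
	free_to_head(100);
	printf("head placement: next alloc = pfn %d\n", alloc_page());

	/* hotplugged pfn 200 freed to the tail: older memory is handed
	 * out first and the new block stays untouched */
	free_to_tail(200);
	printf("tail placement: next alloc = pfn %d\n", alloc_page());
	return 0;
}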
The freshly onlined pages will usually be placed to the head of the freelists, meaning they will be allocated next, turning the just-added memory usually immediately un-removable. The fresh pages are cold, prefering to allocate others (that might be hot) also feels to be the natural thing to do. It also applies to the hyper-v balloon xen-balloon, and ppc64 dlpar: when adding separate, successive memory blocks, each memory block will have unmovable allocations on them - for example gigantic pages will fail to allocate. While the ZONE_NORMAL doesn't provide any guarantees that memory can get offlined+removed again (any kind of fragmentation with unmovable allocations is possible), there are many scenarios (hotplugging a lot of memory, running workload, hotunplug some memory/as much as possible) where we can offline+remove quite a lot with this patchset. a) To visualize the problem, a very simple example: Start a VM with 4GB and 8GB of virtio-mem memory: [root@localhost ~]# lsmem RANGE SIZE STATE REMOVABLE BLOCK 0x0000000000000000-0x00000000bfffffff 3G online yes 0-23 0x0000000100000000-0x000000033fffffff 9G online yes 32-103 Memory block size: 128M Total online memory: 12G Total offline memory: 0B Then try to unplug as much as possible using virtio-mem. Observe which memory blocks are still around. Without this patch set: [root@localhost ~]# lsmem RANGE SIZE STATE REMOVABLE BLOCK 0x0000000000000000-0x00000000bfffffff 3G online yes 0-23 0x0000000100000000-0x000000013fffffff 1G online yes 32-39 0x0000000148000000-0x000000014fffffff 128M online yes 41 0x0000000158000000-0x000000015fffffff 128M online yes 43 0x0000000168000000-0x000000016fffffff 128M online yes 45 0x0000000178000000-0x000000017fffffff 128M online yes 47 0x0000000188000000-0x0000000197ffffff 256M online yes 49-50 0x00000001a0000000-0x00000001a7ffffff 128M online yes 52 0x00000001b0000000-0x00000001b7ffffff 128M online yes 54 0x00000001c0000000-0x00000001c7ffffff 128M online yes 56 0x00000001d0000000-0x00000001d7ffffff 128M online yes 58 0x00000001e0000000-0x00000001e7ffffff 128M online yes 60 0x00000001f0000000-0x00000001f7ffffff 128M online yes 62 0x0000000200000000-0x0000000207ffffff 128M online yes 64 0x0000000210000000-0x0000000217ffffff 128M online yes 66 0x0000000220000000-0x0000000227ffffff 128M online yes 68 0x0000000230000000-0x0000000237ffffff 128M online yes 70 0x0000000240000000-0x0000000247ffffff 128M online yes 72 0x0000000250000000-0x0000000257ffffff 128M online yes 74 0x0000000260000000-0x0000000267ffffff 128M online yes 76 0x0000000270000000-0x0000000277ffffff 128M online yes 78 0x0000000280000000-0x0000000287ffffff 128M online yes 80 0x0000000290000000-0x0000000297ffffff 128M online yes 82 0x00000002a0000000-0x00000002a7ffffff 128M online yes 84 0x00000002b0000000-0x00000002b7ffffff 128M online yes 86 0x00000002c0000000-0x00000002c7ffffff 128M online yes 88 0x00000002d0000000-0x00000002d7ffffff 128M online yes 90 0x00000002e0000000-0x00000002e7ffffff 128M online yes 92 0x00000002f0000000-0x00000002f7ffffff 128M online yes 94 0x0000000300000000-0x0000000307ffffff 128M online yes 96 0x0000000310000000-0x0000000317ffffff 128M online yes 98 0x0000000320000000-0x0000000327ffffff 128M online yes 100 0x0000000330000000-0x000000033fffffff 256M online yes 102-103 Memory block size: 128M Total online memory: 8.1G Total offline memory: 0B With this patch set: [root@localhost ~]# lsmem RANGE SIZE STATE REMOVABLE BLOCK 0x0000000000000000-0x00000000bfffffff 3G online yes 0-23 0x0000000100000000-0x000000013fffffff 1G online 
yes 32-39 Memory block size: 128M Total online memory: 4G Total offline memory: 0B All memory can get unplugged, all memory block can get removed. Of course, no workload ran and the system was basically idle, but it highlights the issue - the fairly deterministic chain of unmovable allocations. When a huge page for the 2MB memmap is needed, a just-onlined 4MB page will be split. The remaining 2MB page will be used for the memmap of the next memory block. So one memory block will hold the memmap of the two following memory blocks. Finally the pages of the last-onlined memory block will get used for the next bigger allocations - if any allocation is unmovable, all dependent memory blocks cannot get unplugged and removed until that allocation is gone. Note that with bigger memory blocks (e.g., 256MB), *all* memory blocks are dependent and none can get unplugged again! b) Experiment with memory intensive workload I performed an experiment with an older version of this patch set (before we used undo_isolate_page_range() in online_pages(): Hotplug 56GB to a VM with an initial 4GB, onlining all memory to ZONE_NORMAL right from the kernel when adding it. I then run various memory intensive workloads that consume most system memory for a total of 45 minutes. Once finished, I try to unplug as much memory as possible. With this change, I am able to remove via virtio-mem (adding individual 128MB memory blocks) 413 out of 448 added memory blocks. Via individual (256MB) DIMMs 380 out of 448 added memory blocks. (I don't have any numbers without this patchset, but looking at the above example, it's at most half of the 448 memory blocks for virtio-mem, and most probably none for DIMMs). Again, there are workloads that might behave very differently due to the nature of ZONE_NORMAL. This change also affects (besides memory onlining): - Other users of undo_isolate_page_range(): Pages are always placed to the tail. -- When memory offlining fails -- When memory isolation fails after having isolated some pageblocks -- When alloc_contig_range() either succeeds or fails - Other users of __putback_isolated_page(): Pages are always placed to the tail. -- Free page reporting - Other users of __free_pages_core() -- AFAIKs, any memory that is getting exposed to the buddy during boot. IIUC we will now usually allocate memory from lower addresses within a zone first (especially during boot). - Other users of generic_online_page() -- Hyper-V balloon This patch (of 5): Let's prepare for additional flags and avoid long parameter lists of bools. Follow-up patches will also make use of the flags in __free_pages_ok(). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Alexander Duyck Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Acked-by: Michal Hocko Cc: Mel Gorman Cc: Dave Hansen Cc: Mike Rapoport Cc: Matthew Wilcox Cc: Haiyang Zhang Cc: "K. Y. 
Srinivasan" Cc: Michael Ellerman Cc: Michal Hocko Cc: Scott Cheloha Cc: Stephen Hemminger Cc: Wei Liu Cc: Michal Hocko Link: https://lkml.kernel.org/r/20201005121534.15649-1-david@redhat.com Link: https://lkml.kernel.org/r/20201005121534.15649-2-david@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f7f292f1d108..fc54ef10107b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -78,6 +78,22 @@ #include "shuffle.h" #include "page_reporting.h" +/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ +typedef int __bitwise fpi_t; + +/* No special request */ +#define FPI_NONE ((__force fpi_t)0) + +/* + * Skip free page reporting notification for the (possibly merged) page. + * This does not hinder free page reporting from grabbing the page, + * reporting it and marking it "reported" - it only skips notifying + * the free page reporting infrastructure about a newly freed page. For + * example, used when temporarily pulling a page from a freelist and + * putting it back unmodified. + */ +#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -952,7 +968,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype, bool report) + int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); unsigned long buddy_pfn; @@ -1039,7 +1055,7 @@ done_merging: add_to_free_list(page, zone, order, migratetype); /* Notify page reporting subsystem of freed page */ - if (report) + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) page_reporting_notify_free(order); } @@ -1380,7 +1396,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1396,7 +1412,7 @@ static void free_one_page(struct zone *zone, is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype, true); + __free_one_page(page, pfn, zone, order, migratetype, FPI_NONE); spin_unlock(&zone->lock); } @@ -3289,7 +3305,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) lockdep_assert_held(&zone->lock); /* Return isolated page to tail of freelist. */ - __free_one_page(page, page_to_pfn(page), zone, order, mt, false); + __free_one_page(page, page_to_pfn(page), zone, order, mt, + FPI_SKIP_REPORT_NOTIFY); } /* -- cgit v1.2.3-59-g8ed1b From 47b6a24a23825ae7b33ff11396980da7c353843d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:26 -0700 Subject: mm/page_alloc: place pages to tail in __putback_isolated_page() __putback_isolated_page() already documents that pages will be placed to the tail of the freelist - this is, however, not the case for "order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be the case for all existing users. 
This change affects two users: - free page reporting - page isolation, when undoing the isolation (including memory onlining). This behavior is desirable for pages that haven't really been touched lately, so exactly the two users that don't actually read/write page content, but rather move untouched pages. The new behavior is especially desirable for memory onlining, where we allow allocation of newly onlined pages via undo_isolate_page_range() in online_pages(). Right now, we always place them to the head of the freelist, resulting in undesireable behavior: Assume we add individual memory chunks via add_memory() and online them right away to the NORMAL zone. We create a dependency chain of unmovable allocations e.g., via the memmap. The memmap of the next chunk will be placed onto previous chunks - if the last block cannot get offlined+removed, all dependent ones cannot get offlined+removed. While this can already be observed with individual DIMMs, it's more of an issue for virtio-mem (and I suspect also ppc DLPAR). Document that this should only be used for optimizations, and no code should rely on this behavior for correction (if the order of the freelists ever changes). We won't care about page shuffling: memory onlining already properly shuffles after onlining. free page reporting doesn't care about physically contiguous ranges, and there are already cases where page isolation will simply move (physically close) free pages to (currently) the head of the freelists via move_freepages_block() instead of shuffling. If this becomes ever relevant, we should shuffle the whole zone when undoing isolation of larger ranges, and after free_contig_range(). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Alexander Duyck Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Acked-by: Michal Hocko Cc: Mel Gorman Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Scott Cheloha Cc: Michael Ellerman Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Stephen Hemminger Cc: Wei Liu Link: https://lkml.kernel.org/r/20201005121534.15649-3-david@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc54ef10107b..eb79d2af9de6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -94,6 +94,18 @@ typedef int __bitwise fpi_t; */ #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) +/* + * Place the (possibly merged) page to the tail of the freelist. Will ignore + * page shuffling (relevant code - e.g., memory onlining - is expected to + * shuffle the whole zone). + * + * Note: No code should rely on this flag for correctness - it's purely + * to allow for optimizations when handing back either fresh pages + * (memory onlining) or untouched pages (page isolation, free page + * reporting). 
+ */ +#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -1044,7 +1056,9 @@ continue_merging: done_merging: set_page_order(page, order); - if (is_shuffle_order(order)) + if (fpi_flags & FPI_TO_TAIL) + to_tail = true; + else if (is_shuffle_order(order)) to_tail = shuffle_pick_tail(); else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); @@ -3306,7 +3320,7 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) /* Return isolated page to tail of freelist. */ __free_one_page(page, page_to_pfn(page), zone, order, mt, - FPI_SKIP_REPORT_NOTIFY); + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); } /* -- cgit v1.2.3-59-g8ed1b From 293ffa5ebb9c08a77d8de458166c31b4d7b0cd65 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:30 -0700 Subject: mm/page_alloc: move pages to tail in move_to_free_list() Whenever we move pages between freelists via move_to_free_list()/ move_freepages_block(), we don't actually touch the pages: 1. Page isolation doesn't actually touch the pages, it simply isolates pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist. When undoing isolation, we move the pages back to the target list. 2. Page stealing (steal_suitable_fallback()) moves free pages directly between lists without touching them. 3. reserve_highatomic_pageblock()/unreserve_highatomic_pageblock() moves free pages directly between freelists without touching them. We already place pages to the tail of the freelists when undoing isolation via __putback_isolated_page(), let's do it in any case (e.g., if order <= pageblock_order) and document the behavior. To simplify, let's move the pages to the tail for all move_to_free_list()/move_freepages_block() users. In 2., the target list is empty, so there should be no change. In 3., we might observe a change, however, highatomic is more concerned about allocations succeeding than cache hotness - if we ever realize this change degrades a workload, we can special-case this instance and add a proper comment. This change results in all pages getting onlined via online_pages() to be placed to the tail of the freelist. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Acked-by: Pankaj Gupta Acked-by: Michal Hocko Cc: Alexander Duyck Cc: Mel Gorman Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Scott Cheloha Cc: Michael Ellerman Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Stephen Hemminger Cc: Wei Liu Link: https://lkml.kernel.org/r/20201005121534.15649-4-david@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++++--- mm/page_isolation.c | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eb79d2af9de6..ea7b697bcf6c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -901,13 +901,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, area->nr_free++; } -/* Used for pages which are on another list */ +/* + * Used for pages which are on another list. Move the pages to the tail + * of the list - so the moved pages won't immediately be considered for + * allocation again (e.g., optimization for memory onlining). 
+ */ static inline void move_to_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype) { struct free_area *area = &zone->free_area[order]; - list_move(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->lru, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -2340,7 +2344,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the free lists of the requested type. + * Move the free pages in a range to the freelist tail of the requested type. * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ diff --git a/mm/page_isolation.c b/mm/page_isolation.c index abfe26ad59fd..83692b937784 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -106,6 +106,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * If we isolate freepage with more than pageblock_order, there * should be no freepage in the range, so we could avoid costly * pageblock scanning for freepage moving. + * + * We didn't actually touch any of the isolated pages, so place them + * to the tail of the freelist. This is an optimization for memory + * onlining - just onlined memory won't immediately be considered for + * allocation. */ if (!isolated_page) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); -- cgit v1.2.3-59-g8ed1b From 7fef431be9c9ac255838a9578331567b9dba4477 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:35 -0700 Subject: mm/page_alloc: place pages to tail in __free_pages_core() __free_pages_core() is used when exposing fresh memory to the buddy during system boot and when onlining memory in generic_online_page(). generic_online_page() is used in two cases: 1. Direct memory onlining in online_pages(). 2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV balloon and virtio-mem), when parts of a section are kept fake-offline to be fake-onlined later on. In 1, we already place pages to the tail of the freelist. Pages will be freed to MIGRATE_ISOLATE lists first and moved to the tail of the freelists via undo_isolate_page_range(). In 2, we currently don't implement a proper rule. In case of virtio-mem, where we currently always online MAX_ORDER - 1 pages, the pages will be placed to the HEAD of the freelist - undesireable. While the hyper-v balloon calls generic_online_page() with single pages, usually it will call it on successive single pages in a larger block. The pages are fresh, so place them to the tail of the freelist and avoid the PCP. In __free_pages_core(), remove the now superflouos call to set_page_refcounted() and add a comment regarding page initialization and the refcount. Note: In 2. we currently don't shuffle. If ever relevant (page shuffling is usually of limited use in virtualized environments), we might want to shuffle after a sequence of generic_online_page() calls in the relevant callers. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Acked-by: Pankaj Gupta Acked-by: Michal Hocko Cc: Alexander Duyck Cc: Mel Gorman Cc: Dave Hansen Cc: Mike Rapoport Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Michal Hocko Cc: Scott Cheloha Link: https://lkml.kernel.org/r/20201005121534.15649-5-david@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea7b697bcf6c..3b032dac62e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -275,7 +275,8 @@ bool pm_suspended_storage(void) unsigned int pageblock_order __read_mostly; #endif -static void __free_pages_ok(struct page *page, unsigned int order); +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -687,7 +688,7 @@ out: void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page)); + __free_pages_ok(page, compound_order(page), FPI_NONE); } void prep_compound_page(struct page *page, unsigned int order) @@ -1423,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, - int migratetype) + int migratetype, fpi_t fpi_flags) { spin_lock(&zone->lock); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype, FPI_NONE); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock(&zone->lock); } @@ -1508,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) } } -static void __free_pages_ok(struct page *page, unsigned int order) +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; int migratetype; @@ -1520,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); local_irq_restore(flags); } @@ -1530,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int order) struct page *p = page; unsigned int loop; + /* + * When initializing the memmap, __init_single_page() sets the refcount + * of all pages to 1 ("allocated"/"not free"). We have to set the + * refcount of all involved pages to 0. + */ prefetchw(p); for (loop = 0; loop < (nr_pages - 1); loop++, p++) { prefetchw(p + 1); @@ -1540,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int order) set_page_count(p, 0); atomic_long_add(nr_pages, &page_zone(page)->managed_pages); - set_page_refcounted(page); - __free_pages(page, order); + + /* + * Bypass PCP and place fresh pages right to the tail, primarily + * relevant for memory onlining. 
+ */ + __free_pages_ok(page, order, FPI_TO_TAIL); } #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -3168,7 +3180,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype, + FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; @@ -4991,7 +5004,7 @@ static inline void free_the_page(struct page *page, unsigned int order) if (order == 0) /* Via pcp? */ free_unref_page(page); else - __free_pages_ok(page, order); + __free_pages_ok(page, order, FPI_NONE); } void __free_pages(struct page *page, unsigned int order) -- cgit v1.2.3-59-g8ed1b From b86c5fc4e71a23e284b7cdfb757fb76534ab4119 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:39 -0700 Subject: mm/memory_hotplug: update comment regarding zone shuffling As we no longer shuffle via generic_online_page() and when undoing isolation, we can simplify the comment. We now effectively shuffle only once (properly) when onlining new memory. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Acked-by: Michal Hocko Cc: Alexander Duyck Cc: Mel Gorman Cc: Michal Hocko Cc: Dave Hansen Cc: Vlastimil Babka Cc: Wei Yang Cc: Oscar Salvador Cc: Mike Rapoport Cc: Pankaj Gupta Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Scott Cheloha Cc: Stephen Hemminger Cc: Wei Liu Link: https://lkml.kernel.org/r/20201005121534.15649-6-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d397af38f9ce..6f203574ca1d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -858,13 +858,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); /* - * When exposing larger, physically contiguous memory areas to the - * buddy, shuffling in the buddy (when freeing onlined pages, putting - * them either to the head or the tail of the freelist) is only helpful - * for maintaining the shuffle, but not for creating the initial - * shuffle. Shuffle the whole zone to make sure the just onlined pages - * are properly distributed across the whole freelist. Make sure to - * shuffle once pageblocks are no longer isolated. + * Freshly onlined pages aren't shuffled (e.g., all pages are placed to + * the tail of the freelist when undoing isolation). Shuffle the whole + * zone to make sure the just onlined pages are properly distributed + * across the whole freelist - to create an initial shuffle. */ shuffle_zone(zone); -- cgit v1.2.3-59-g8ed1b From 4e79603bbd33aa9600313ae6f887741efbb01456 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 15 Oct 2020 20:09:43 -0700 Subject: zram: failing to decompress is WARN_ON worthy If we fail to decompress in zram it's a pretty serious problem. We were entrusted to be able to decompress the old data but we failed. Either we've got some crazy bug in the compression code or we've got memory corruption. At the moment, when this happens the log looks like this: ERR kernel: [ 1833.099861] zram: Decompression failed! err=-22, page=336112 ERR kernel: [ 1833.099881] zram: Decompression failed! 
err=-22, page=336112 ALERT kernel: [ 1833.099886] Read-error on swap-device (253:0:2688896) It is true that we have an "ALERT" level log in there, but (at least to me) it feels like even this isn't enough to impart the seriousness of this error. Let's convert to a WARN_ON. Note that WARN_ON is automatically "unlikely" so we can simply replace the old annotation with the new one. Signed-off-by: Douglas Anderson Signed-off-by: Andrew Morton Acked-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Sonny Rao Cc: Jens Axboe Link: https://lkml.kernel.org/r/20200917174059.1.If09c882545dbe432268f7a67a4d4cfcb6caace4f@changeid Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bff3d4021c18..029403c18ca3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1270,7 +1270,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) + if (WARN_ON(ret)) pr_err("Decompression failed! err=%d, page=%u\n", ret, index); return ret; -- cgit v1.2.3-59-g8ed1b From c7df08f1953bb6cbd61b0c37f825b6e7b4489217 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 15 Oct 2020 20:09:46 -0700 Subject: mm/slab.h: remove duplicate include Remove duplicate header which is included twice. Signed-off-by: YueHaibing Signed-off-by: Andrew Morton Reviewed-by: Pekka Enberg Link: http://lkml.kernel.org/r/20200818114323.58156-1-yuehaibing@huawei.com Signed-off-by: Linus Torvalds --- mm/slab.h | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index 6dd4b702888a..06c6587765a3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -46,7 +46,6 @@ struct kmem_cache { #include #include #include -#include /* * State of the slab allocator. -- cgit v1.2.3-59-g8ed1b From 58f6f03497164445edaafbcf421dff712155ca7f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:09:49 -0700 Subject: mm/page_reporting.c: drop stale list head check in page_reporting_cycle list_for_each_entry_safe() guarantees that we will never stumble over the list head; "&page->lru != list" will always evaluate to true. Let's simplify. [david@redhat.com: Changelog refinements] Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Alexander Duyck Link: http://lkml.kernel.org/r/20200818084448.33969-1-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- mm/page_reporting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 3bbd471cfc81..aaaa3605123d 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, * the new head of the free list before we release the * zone lock. */ - if (&page->lru != list && !list_is_first(&page->lru, list)) + if (!list_is_first(&page->lru, list)) list_rotate_to_front(&page->lru, list); /* release lock before waiting on report processing */ -- cgit v1.2.3-59-g8ed1b From 955cc774f286adddeef85bd2e69edb0f2dab3214 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 15 Oct 2020 20:09:52 -0700 Subject: mm/highmem.c: clean up endif comments The #endif at the end of the file matches up with the '#if defined(HASHED_PAGE_VIRTUAL)' on line 374. Not the CONFIG_HIGHMEM #if earlier. 
Fix comments on both of the #endif's to indicate the correct end of blocks for each. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Mike Rapoport Link: https://lkml.kernel.org/r/20200819184635.112579-1-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- mm/highmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 64d8dea47dd1..1352a27951e3 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -369,7 +369,7 @@ void kunmap_high(struct page *page) } EXPORT_SYMBOL(kunmap_high); -#endif +#endif /* CONFIG_HIGHMEM */ #if defined(HASHED_PAGE_VIRTUAL) @@ -481,4 +481,4 @@ void __init page_address_init(void) } } -#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ +#endif /* defined(HASHED_PAGE_VIRTUAL) */ -- cgit v1.2.3-59-g8ed1b From ed0173733dd468883198c3136284394320b8fad6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 15 Oct 2020 20:09:55 -0700 Subject: mm: use self-explanatory macros rather than "2" Signed-off-by: Yu Zhao Signed-off-by: Andrew Morton Cc: Alex Shi Link: http://lkml.kernel.org/r/20200831175042.3527153-2-yuzhao@google.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++---- include/linux/vmstat.h | 2 +- mm/vmscan.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c27fb1faffe5..7e0ea3fe95ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -266,6 +266,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define ANON_AND_FILE 2 + enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -283,8 +285,8 @@ struct lruvec { unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; - /* Refaults at the time of last reclaim cycle, anon=0, file=1 */ - unsigned long refaults[2]; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG @@ -441,6 +443,8 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H +#define ASYNC_AND_SYNC 2 + struct zone { /* Read-mostly fields */ @@ -560,8 +564,8 @@ struct zone { #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; + /* pfn where compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7557c1070fd7..322dcbfcc933 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -28,7 +28,7 @@ struct reclaim_stat { unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; - unsigned nr_activate[2]; + unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; diff --git a/mm/vmscan.c b/mm/vmscan.c index fc4dee9b91f8..1b8f0e059767 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2239,7 +2239,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = 
mem_cgroup_swappiness(memcg); - u64 fraction[2]; + u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; unsigned long ap, fp; -- cgit v1.2.3-59-g8ed1b From 0e9aa6755757ea5e06e6130d5e50a93442456499 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 15 Oct 2020 20:09:58 -0700 Subject: mm: fix some broken comments Fix some broken comments, including a typo, a grammar error and a wrong function name. Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200913095456.54873-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- mm/swap_state.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 8aad142ec4be..45e564a20f8e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1445,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem * unlock_page - unlock a locked page * @page: the page * - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). + * Unlocks the page and wakes up sleepers in wait_on_page_locked(). * Also wakes sleepers in wait_on_page_writeback() because the wakeup * mechanism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. @@ -3004,7 +3004,7 @@ filler: goto out; /* - * Page is not up to date and may be locked due one of the following + * Page is not up to date and may be locked due to one of the following * case a: Page is being filled and the page lock is held * case b: Read/write error clearing the page uptodate status * case c: Truncation in progress (page locked) diff --git a/mm/swap_state.c b/mm/swap_state.c index aa40e706604c..ee465827420e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -246,7 +246,7 @@ int add_to_swap(struct page *page) goto fail; /* * Normally the page will be dirtied in unmap because its pte should be - * dirty. A special case is MADV_FREE page. The page'e pte could have + * dirty. A special case is MADV_FREE page. The page's pte could have * dirty bit cleared but the page's SwapBacked bit is still set because * clearing the dirty bit and SwapBacked bit has no lock protected. For * such page, unmap will not set dirty bit for it, so page reclaim will -- cgit v1.2.3-59-g8ed1b From 70b6d25ec59cbc1327456ca15c2a779421ed13ce Mon Sep 17 00:00:00 2001 From: Chen Tao Date: Thu, 15 Oct 2020 20:10:01 -0700 Subject: mm: fix some comments formatting Correct one function name, "get_partials", to "get_partial". Update the old struct name, list3, to kmem_cache_node. Signed-off-by: Chen Tao Signed-off-by: Andrew Morton Reviewed-by: Mike Rapoport Link: https://lkml.kernel.org/r/Message-ID: Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 399a9d185b0f..b1113561b98b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1062,7 +1062,7 @@ int slab_prepare_cpu(unsigned int cpu) * Even if all the cpus of a node are down, we don't free the * kmem_cache_node of any cache. This to avoid a race between cpu_down, and * a kmalloc allocation from another cpu for memory from the node of - * the cpu going down. The list3 structure is usually allocated from + * the cpu going down. The kmem_cache_node structure is usually allocated from * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
*/ int slab_dead_cpu(unsigned int cpu) diff --git a/mm/slub.c b/mm/slub.c index 61d0d2968413..b30be2385d1c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1956,7 +1956,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partials() + * partial slab and there is none available then get_partial() * will return NULL. */ if (!n || !n->nr_partial) -- cgit v1.2.3-59-g8ed1b From e755f4af08b74e5bee4f7a1fc2c3a05188774608 Mon Sep 17 00:00:00 2001 From: Xiaofei Tan Date: Thu, 15 Oct 2020 20:10:05 -0700 Subject: mm/workingset.c: fix some doc warnings Fix the following warnings, caused by a mismatch between function parameters and comments. mm/workingset.c:228: warning: Function parameter or member 'lruvec' not described in 'workingset_age_nonresident' mm/workingset.c:228: warning: Excess function parameter 'memcg' description in 'workingset_age_nonresident' Signed-off-by: Xiaofei Tan Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/1600485913-11192-1-git-send-email-tanxiaofei@huawei.com Signed-off-by: Linus Torvalds --- mm/workingset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/workingset.c b/mm/workingset.c index 92e66113a577..8ed8e6296d8c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -216,7 +216,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, /** * workingset_age_nonresident - age non-resident entries as LRU ages - * @memcg: the lruvec that was aged + * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as -- cgit v1.2.3-59-g8ed1b From 73eb7f9a4ff034d20c8f9454a5a6f45a60cf8dfc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 15 Oct 2020 20:10:08 -0700 Subject: mm: use helper function put_write_access() In commit 1da177e4c3f4 ("Linux-2.6.12-rc2"), the helper put_write_access() came in together with the atomic_dec operation on the i_writecount field, but __vma_link_file() and dup_mmap() forgot to use this helper. Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200924115235.5111-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 3ca8f1f83fb3..e0f74be1423c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -556,7 +556,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); diff --git a/mm/mmap.c b/mm/mmap.c index 67d11ad6df24..9fb9f8a233c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -619,7 +619,7 @@ static void __vma_link_file(struct vm_area_struct *vma) struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file_inode(file)->i_writecount); + put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); -- cgit v1.2.3-59-g8ed1b From 1f0f8c0de09066d23760c1f5fac2cd53b32f1127 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 15 Oct 2020 20:10:11 -0700 Subject: include/linux/mmzone.h: remove unused early_pfn_valid() The early_pfn_valid() macro is defined but never used.
Remove it. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Link: https://lkml.kernel.org/r/20200923162915.26935-1-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e0ea3fe95ca..fb3bf696c05e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1420,7 +1420,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -#define early_pfn_valid(pfn) pfn_valid(pfn) void sparse_init(void); #else #define sparse_init() do {} while (0) @@ -1440,10 +1439,6 @@ struct mminit_pfnnid_cache { int last_nid; }; -#ifndef early_pfn_valid -#define early_pfn_valid(pfn) (1) -#endif - /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. -- cgit v1.2.3-59-g8ed1b From ab130f9108dcf2062a44f9f0706824ef2e30492e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:10:15 -0700 Subject: mm: rename page_order() to buddy_order() The current page_order() can only be called on pages in the buddy allocator. For compound pages, you have to use compound_order(). This is confusing and led to a bug, so rename page_order() to buddy_order(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20201001152259.14932-2-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++--- mm/internal.h | 8 ++++---- mm/page_alloc.c | 30 +++++++++++++++--------------- mm/page_isolation.c | 4 ++-- mm/page_owner.c | 6 +++--- mm/page_reporting.c | 2 +- mm/shuffle.c | 2 +- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6c63844fc061..6e0ee5641788 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -625,7 +625,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, } /* Found a free page, will break it into order-0 pages */ - order = page_order(page); + order = buddy_order(page); isolated = __isolate_free_page(page, order); if (!isolated) break; @@ -898,7 +898,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * potential isolation targets. */ if (PageBuddy(page)) { - unsigned long freepage_order = page_order_unsafe(page); + unsigned long freepage_order = buddy_order_unsafe(page); /* * Without lock, we cannot be sure that what we got is @@ -1172,7 +1172,7 @@ static bool suitable_migration_target(struct compact_control *cc, * the only small danger is that we skip a potentially suitable * pageblock, so it's not worth to check order for valid range. */ - if (page_order_unsafe(page) >= pageblock_order) + if (buddy_order_unsafe(page) >= pageblock_order) return false; } diff --git a/mm/internal.h b/mm/internal.h index 6345b08ce86c..c43ccdddb0f6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -270,16 +270,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must - * handle invalid values gracefully, and use page_order_unsafe() below. + * handle invalid values gracefully, and use buddy_order_unsafe() below. 
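+ *
+ * (An illustrative sketch, not from this patch, of the ambiguity the
+ * rename resolves; the helpers are real, the fragment is hypothetical:
+ *
+ *	if (PageBuddy(page))
+ *		order = buddy_order(page);	(a free page in the buddy)
+ *	else if (PageCompound(page))
+ *		order = compound_order(page);	(an allocated compound page)
+ *
+ * so each call site now names the kind of page it expects.)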
*/ -static inline unsigned int page_order(struct page *page) +static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* - * Like page_order(), but for callers who cannot afford to hold the zone lock. + * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * @@ -289,7 +289,7 @@ static inline unsigned int page_order(struct page *page) * times, potentially observing different values in the tests and the actual * use of the result. */ -#define page_order_unsafe(page) READ_ONCE(page_private(page)) +#define buddy_order_unsafe(page) READ_ONCE(page_private(page)) static inline bool is_cow_mapping(vm_flags_t flags) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b032dac62e6..ccf615c0627e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -792,7 +792,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif -static inline void set_page_order(struct page *page, unsigned int order) +static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -817,7 +817,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy, if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; - if (page_order(buddy) != order) + if (buddy_order(buddy) != order) return false; /* @@ -1059,7 +1059,7 @@ continue_merging: } done_merging: - set_page_order(page, order); + set_buddy_order(page, order); if (fpi_flags & FPI_TO_TAIL) to_tail = true; @@ -2178,7 +2178,7 @@ static inline void expand(struct zone *zone, struct page *page, continue; add_to_free_list(&page[size], zone, high, migratetype); - set_page_order(&page[size], high); + set_buddy_order(&page[size], high); } } @@ -2392,7 +2392,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); VM_BUG_ON_PAGE(page_zone(page) != zone, page); - order = page_order(page); + order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; @@ -2516,7 +2516,7 @@ static inline void boost_watermark(struct zone *zone) static void steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, int start_type, bool whole_block) { - unsigned int current_order = page_order(page); + unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; int old_block_type; @@ -8344,7 +8344,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, */ if (!page_ref_count(page)) { if (PageBuddy(page)) - iter += (1 << page_order(page)) - 1; + iter += (1 << buddy_order(page)) - 1; continue; } @@ -8557,7 +8557,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } if (outer_start != start) { - order = page_order(pfn_to_page(outer_start)); + order = buddy_order(pfn_to_page(outer_start)); /* * outer_start page could be small order buddy page and @@ -8782,7 +8782,7 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); - order = page_order(page); + order = buddy_order(page); del_page_from_free_list(page, zone, order); pfn += (1 << order); } @@ -8801,7 +8801,7 @@ bool is_free_buddy_page(struct page *page) for (order = 0; order < MAX_ORDER; 
order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && page_order(page_head) >= order) + if (PageBuddy(page_head) && buddy_order(page_head) >= order) break; } spin_unlock_irqrestore(&zone->lock, flags); @@ -8838,7 +8838,7 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, if (current_buddy != target) { add_to_free_list(current_buddy, zone, high, migratetype); - set_page_order(current_buddy, high); + set_buddy_order(current_buddy, high); page = next_page; } } @@ -8858,16 +8858,16 @@ bool take_page_off_buddy(struct page *page) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - int buddy_order = page_order(page_head); + int page_order = buddy_order(page_head); - if (PageBuddy(page_head) && buddy_order >= order) { + if (PageBuddy(page_head) && page_order >= order) { unsigned long pfn_head = page_to_pfn(page_head); int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, buddy_order); + del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, - buddy_order, migratetype); + page_order, migratetype); ret = true; break; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 83692b937784..abbf42214485 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * these pages to be merged. */ if (PageBuddy(page)) { - order = page_order(page); + order = buddy_order(page); if (order >= pageblock_order) { pfn = page_to_pfn(page); buddy_pfn = __find_buddy_pfn(pfn, order); @@ -261,7 +261,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * the correct MIGRATE_ISOLATE freelist. There is no * simple way to verify that as VM_BUG_ON(), though. */ - pfn += 1 << page_order(page); + pfn += 1 << buddy_order(page); else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; diff --git a/mm/page_owner.c b/mm/page_owner.c index 4ca3051a1035..b735a8eafcdb 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -295,7 +295,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageBuddy(page)) { unsigned long freepage_order; - freepage_order = page_order_unsafe(page); + freepage_order = buddy_order_unsafe(page); if (freepage_order < MAX_ORDER) pfn += (1UL << freepage_order) - 1; continue; @@ -490,7 +490,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) page = pfn_to_page(pfn); if (PageBuddy(page)) { - unsigned long freepage_order = page_order_unsafe(page); + unsigned long freepage_order = buddy_order_unsafe(page); if (freepage_order < MAX_ORDER) pfn += (1UL << freepage_order) - 1; @@ -584,7 +584,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) * heavy lock contention. */ if (PageBuddy(page)) { - unsigned long order = page_order_unsafe(page); + unsigned long order = buddy_order_unsafe(page); if (order > 0 && order < MAX_ORDER) pfn += (1UL << order) - 1; diff --git a/mm/page_reporting.c b/mm/page_reporting.c index aaaa3605123d..cd8e13d41df4 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -92,7 +92,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev, * report on the new larger page when we make our way * up to that higher order. 
*/ - if (PageBuddy(page) && page_order(page) == order) + if (PageBuddy(page) && buddy_order(page) == order) __SetPageReported(page); } while ((sg = sg_next(sg))); diff --git a/mm/shuffle.c b/mm/shuffle.c index 9b5cd4b004b0..9c2e145a747a 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -60,7 +60,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone, * ...is the page on the same list as the page we will * shuffle it with? */ - if (page_order(page) != order) + if (buddy_order(page) != order) return NULL; return page; -- cgit v1.2.3-59-g8ed1b From ce9bebe683a131b86be986c3b6a741a520327679 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:18 -0700 Subject: fs: configfs: delete repeated words in comments Drop duplicated words {the, that} in comments. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Cc: Joel Becker Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200811021826.25032-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- fs/configfs/dir.c | 2 +- fs/configfs/file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ca2273727225..b0983e2a4e2c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item); /* * Release the dependent linkage. This is much simpler than - * configfs_depend_item() because we know that that the client driver is + * configfs_depend_item() because we know that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ void configfs_undepend_item(struct config_item *target) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index fb65b706cc0d..1f0270229d7b 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come * on the first write. - * Hint: if you're writing a value, first read the file, modify only the + * Hint: if you're writing a value, first read the file, modify only * the value you're changing, then write entire buffer back. */ -- cgit v1.2.3-59-g8ed1b From b296a6d53339a79082c1d2c1761e948e8b3def69 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:10:21 -0700 Subject: kernel.h: split out min()/max() et al. helpers kernel.h has been used as a dumping ground for all kinds of stuff for a long time. Here is an attempt to start cleaning it up by splitting out the min()/max() et al. helpers. At the same time, convert users in the header and lib folders to use the new header. Though, for the time being, include the new header back into kernel.h to avoid twisted indirect includes for other existing users. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Cc: "Rafael J.
Wysocki" Cc: Steven Rostedt Cc: Rasmus Villemoes Cc: Joe Perches Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20200910164152.GA1891694@smile.fi.intel.com Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 + include/linux/bvec.h | 6 +- include/linux/jiffies.h | 3 +- include/linux/kernel.h | 150 +-------------------------------------------- include/linux/minmax.h | 153 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nodemask.h | 2 +- include/linux/uaccess.h | 1 + kernel/range.c | 3 +- lib/find_bit.c | 1 + lib/hexdump.c | 1 + lib/math/rational.c | 2 +- lib/math/reciprocal_div.c | 1 + 12 files changed, 170 insertions(+), 154 deletions(-) create mode 100644 include/linux/minmax.h diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c09375e0a0eb..639cae2c158b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/bvec.h b/include/linux/bvec.h index dd74503f7e5e..2efec10bf792 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -7,10 +7,14 @@ #ifndef __LINUX_BVEC_ITER_H #define __LINUX_BVEC_ITER_H -#include #include #include +#include +#include #include +#include + +struct page; /** * struct bio_vec - a contiguous range of physical memory addresses diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fed6ba96c527..5e13f801c902 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -3,8 +3,9 @@ #define _LINUX_JIFFIES_H #include +#include #include -#include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e4aa29b1ad62..c629215fdad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -833,155 +834,6 @@ ftrace_vprintk(const char *fmt, va_list ap) static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ -/* - * min()/max()/clamp() macros must accomplish three things: - * - * - avoid multiple evaluations of the arguments (so side-effects like - * "x++" happen only once) when non-constant. - * - perform strict type-checking (to generate warnings instead of - * nasty runtime surprises). See the "unnecessary" pointer comparison - * in __typecheck(). - * - retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). - */ -#define __typecheck(x, y) \ - (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) - -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - -#define __no_side_effects(x, y) \ - (__is_constexpr(x) && __is_constexpr(y)) - -#define __safe_cmp(x, y) \ - (__typecheck(x, y) && __no_side_effects(x, y)) - -#define __cmp(x, y, op) ((x) op (y) ? 
(x) : (y)) - -#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ - __cmp(unique_x, unique_y, op); }) - -#define __careful_cmp(x, y, op) \ - __builtin_choose_expr(__safe_cmp(x, y), \ - __cmp(x, y, op), \ - __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) - -/** - * min - return minimum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define min(x, y) __careful_cmp(x, y, <) - -/** - * max - return maximum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define max(x, y) __careful_cmp(x, y, >) - -/** - * min3 - return minimum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) - -/** - * max3 - return maximum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) - -/** - * min_not_zero - return the minimum that is _not_ zero, unless both are zero - * @x: value1 - * @y: value2 - */ -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -/** - * clamp - return a value clamped to a given range with strict typechecking - * @val: current value - * @lo: lowest allowable value - * @hi: highest allowable value - * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. - */ -#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) - -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. - */ - -/** - * min_t - return minimum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) - -/** - * max_t - return maximum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) - -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - - -/** - * swap - swap values of @a and @b - * @a: first value - * @b: second value - */ -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - /* This counts to 12. Any more, it will return 13th argument. */ #define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n #define COUNT_ARGS(X...) 
__COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/linux/minmax.h b/include/linux/minmax.h new file mode 100644 index 000000000000..c0f57b0c64d9 --- /dev/null +++ b/include/linux/minmax.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MINMAX_H +#define _LINUX_MINMAX_H + +/* + * min()/max()/clamp() macros must accomplish three things: + * + * - avoid multiple evaluations of the arguments (so side-effects like + * "x++" happen only once) when non-constant. + * - perform strict type-checking (to generate warnings instead of + * nasty runtime surprises). See the "unnecessary" pointer comparison + * in __typecheck(). + * - retain result as a constant expressions when called with only + * constant expressions (to avoid tripping VLA warnings in stack + * allocation usage). + */ +#define __typecheck(x, y) \ + (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) + +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + +#define __no_side_effects(x, y) \ + (__is_constexpr(x) && __is_constexpr(y)) + +#define __safe_cmp(x, y) \ + (__typecheck(x, y) && __no_side_effects(x, y)) + +#define __cmp(x, y, op) ((x) op (y) ? (x) : (y)) + +#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + __cmp(unique_x, unique_y, op); }) + +#define __careful_cmp(x, y, op) \ + __builtin_choose_expr(__safe_cmp(x, y), \ + __cmp(x, y, op), \ + __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define min(x, y) __careful_cmp(x, y, <) + +/** + * max - return maximum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define max(x, y) __careful_cmp(x, y, >) + +/** + * min3 - return minimum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define min3(x, y, z) min((typeof(x))min(x, y), z) + +/** + * max3 - return maximum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define max3(x, y, z) max((typeof(x))max(x, y), z) + +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +/** + * clamp - return a value clamped to a given range with strict typechecking + * @val: current value + * @lo: lowest allowable value + * @hi: highest allowable value + * + * This macro does strict typechecking of @lo/@hi to make sure they are of the + * same type as @val. See the unnecessary pointer comparisons. + */ +#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) + +/* + * ..and if you can't take the strict + * types, you can specify one yourself. + * + * Or not use min/max/clamp at all, of course. 
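+ *
+ * Illustrative usage (an editorial example, not part of the original
+ * header; 'len' and 'val' are hypothetical variables):
+ *
+ *	size_t n = min_t(size_t, len, PAGE_SIZE);
+ *	val = clamp(val, 0, 255);
+ *
+ * min_t() casts both arguments to the named type, while a plain
+ * min(len, PAGE_SIZE) would trip the strict type check whenever len
+ * is, e.g., an int and PAGE_SIZE an unsigned long.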
+ */ + +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) + +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) + +/** + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. + */ +#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) + +/** + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. + */ +#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) + +/** + * swap - swap values of @a and @b + * @a: first value + * @b: second value + */ +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 3334ce056335..ac398e143c9a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -90,9 +90,9 @@ * for such situations. See below and CPUMASK_ALLOC also. 
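 *
 * (An illustrative aside, based on the current header contents:
 * nodemask.h needs minmax.h in its own right - __first_node(), for
 * example, clamps its result with
 *
 *	min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES))
 *
 * - so including it directly keeps the header self-contained rather
 * than relying on kernel.h to provide it indirectly.)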
*/ -#include #include #include +#include #include typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1ae36bc8db35..ef084eacaa7c 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -3,6 +3,7 @@ #define __LINUX_UACCESS_H__ #include +#include #include #include diff --git a/kernel/range.c b/kernel/range.c index d84de6766472..56435f96da73 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -2,8 +2,9 @@ /* * Range add and subtract */ -#include #include +#include +#include #include #include #include diff --git a/lib/find_bit.c b/lib/find_bit.c index 49f875f1baf7..4a8751010d59 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -16,6 +16,7 @@ #include #include #include +#include #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \ diff --git a/lib/hexdump.c b/lib/hexdump.c index 147133f8eb2f..9301578f98e8 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/lib/math/rational.c b/lib/math/rational.c index df75c8809693..9781d521963d 100644 --- a/lib/math/rational.c +++ b/lib/math/rational.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* * calculate best rational approximation for a given fraction diff --git a/lib/math/reciprocal_div.c b/lib/math/reciprocal_div.c index bf043258fa00..32436dd4171e 100644 --- a/lib/math/reciprocal_div.c +++ b/lib/math/reciprocal_div.c @@ -4,6 +4,7 @@ #include #include #include +#include /* * For a description of the algorithm please have a look at -- cgit v1.2.3-59-g8ed1b From 15ec0fcff6dab37533e02e2092d9228726186366 Mon Sep 17 00:00:00 2001 From: Liao Pingfang Date: Thu, 15 Oct 2020 20:10:25 -0700 Subject: kernel/sys.c: replace do_brk with do_brk_flags in comment of prctl_set_mm_map() Replace do_brk with do_brk_flags in the comment of prctl_set_mm_map(), since do_brk was removed in the following commit. Fixes: bb177a732c4369 ("mm: do not bug_on on incorrect length in __mm_populate()") Signed-off-by: Liao Pingfang Signed-off-by: Yi Wang Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/1600650751-43127-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sys.c b/kernel/sys.c index ab6c409b1159..6401880dff74 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * - * - @start_brk/@brk which are used in do_brk but kernel lookups + * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself -- cgit v1.2.3-59-g8ed1b From 7b7b8a2c9560efb5874ea1d84d1dce5ba4c8c487 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:28 -0700 Subject: kernel/: fix repeated words in comments Fix multiple occurrences of duplicated words in kernel/. Fix one typo/spello on the same line as a duplicate word. Change one instance of "the the" to "that the". Otherwise just drop one of the repeated words.
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/98202fa6-8919-ef63-9efe-c0fad5ca7af1@infradead.org Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/cgroup/cpuset.c | 2 +- kernel/dma/direct.c | 2 +- kernel/fork.c | 2 +- kernel/futex.c | 2 +- kernel/irq/timings.c | 2 +- kernel/jump_label.c | 2 +- kernel/kcsan/encoding.h | 2 +- kernel/kexec_core.c | 2 +- kernel/kthread.c | 2 +- kernel/livepatch/state.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- kernel/user_namespace.c | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..a1b3cf7b3c4b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b92d08e65999..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,7 +16,7 @@ #include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ diff --git a/kernel/fork.c b/kernel/fork.c index e0f74be1423c..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2189,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..680854dcf156 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. 
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e661c61b3d6b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include #include -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c5e5e5a11535..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in . * * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. * * Return: pointer to the latest struct klp_state from already diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. 
* - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..d9832a171046 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -741,7 +741,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and - * the the info parameter. The function is called + * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) -- cgit v1.2.3-59-g8ed1b From b7621ebf8a0870f8d822a09dbd040aa5c4909da5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:31 -0700 Subject: kernel: acct.c: fix some kernel-doc nits Fix kernel-doc notation to use the documented Returns: syntax and place the function description for acct_process() on the first line where it should be. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Cc: Alexander Viro Link: https://lkml.kernel.org/r/b4c33e5d-98e8-0c47-77b6-ac1859f94d7f@infradead.org Signed-off-by: Linus Torvalds --- kernel/acct.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index a1b3cf7b3c4b..f175df8f6aa4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex); * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * - * Returns 0 for success or negative errno values for failure. - * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. + * + * Returns: 0 for success or negative errno values for failure. 
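+ *
+ * (An illustrative usage sketch, not part of the patch; the path is
+ * only an example:
+ *
+ *	acct("/var/log/account/pacct");		start accounting
+ *	acct(NULL);				stop accounting
+ *
+ * with the libc wrapper returning 0 on success and -1 with errno set
+ * on failure.)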
*/ SYSCALL_DEFINE1(acct, const char __user *, name) { @@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns) } /** - * acct_process - * - * handles process accounting for an exiting task + * acct_process - handles process accounting for an exiting task */ void acct_process(void) { -- cgit v1.2.3-59-g8ed1b From cdfe2d22047661c5e0ecf8f28e7e8843ee177aff Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:10:34 -0700 Subject: get_maintainer: add test for file in VCS It's somewhat common for me to ask get_maintainer to tell me who maintains a patch file rather than the files modified by the patch. Emit a warning if using get_maintainer.pl -f Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/f63229c051567041819f25e76f49d83c6e4c0f71.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 484d2fbf5921..5a36354a38cc 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -541,6 +541,9 @@ foreach my $file (@ARGV) { die "$P: file '${file}' not found\n"; } } + if ($from_filename && (vcs_exists() && !vcs_file_exists($file))) { + warn "$P: file '$file' not found in version control $!\n"; + } if ($from_filename || ($file ne "&STDIN" && vcs_file_exists($file))) { $file =~ s/^\Q${cur_path}\E//; #strip any absolute path $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree -- cgit v1.2.3-59-g8ed1b From 6343f6b71f83af8ebfc6f9740d1d74f2db88e6b0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:10:37 -0700 Subject: get_maintainer: exclude MAINTAINERS file(s) from --git-fallback MAINTAINERS files generally have no specific maintainer but are updated by individuals for subsystems all over the source tree. Exclude MAINTAINERS file(s) from --git-fallback searches so the unlucky individuals that update the files the most are not shown by default. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Cc: Rob Herring Cc: Mauro Carvalho Chehab Cc: "David S. Miller" Link: https://lkml.kernel.org/r/2bacb0a9c06fbb6d56a43bf930e808c74243c908.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 5a36354a38cc..2075db0c08b8 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -957,8 +957,10 @@ sub get_maintainers { foreach my $file (@files) { if ($email && - ($email_git || ($email_git_fallback && - !$exact_pattern_match_hash{$file}))) { + ($email_git || + ($email_git_fallback && + $file !~ /MAINTAINERS$/ && + !$exact_pattern_match_hash{$file}))) { vcs_file_signoffs($file); } if ($email && $email_git_blame) { -- cgit v1.2.3-59-g8ed1b From 32dd8afae7db4d58612338a0bdc0974450e66382 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 15 Oct 2020 20:10:41 -0700 Subject: MAINTAINERS: jarkko.sakkinen@linux.intel.com -> jarkko@kernel.org Use @kernel.org address as the main communications end point. Update the corresponding M-entries and .mailmap (for git shortlog translation). Signed-off-by: Jarkko Sakkinen Signed-off-by: Andrew Morton Cc: Joe Perches Cc: Jonathan Corbet Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: "David S. 
Miller" Cc: Rob Herring Link: https://lkml.kernel.org/r/20201015142710.8371-1-jarkko.sakkinen@linux.intel.com Signed-off-by: Linus Torvalds --- .mailmap | 1 + MAINTAINERS | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index e4ccac4e2f88..1e14566a3d56 100644 --- a/.mailmap +++ b/.mailmap @@ -133,6 +133,7 @@ James Ketrenos Jan Glauber Jan Glauber Jan Glauber +Jarkko Sakkinen Jason Gunthorpe Jason Gunthorpe Jason Gunthorpe diff --git a/MAINTAINERS b/MAINTAINERS index 48b0333d19e2..c925fd3c5c3f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9701,7 +9701,7 @@ F: security/keys/encrypted-keys/ KEYS-TRUSTED M: James Bottomley -M: Jarkko Sakkinen +M: Jarkko Sakkinen M: Mimi Zohar L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org @@ -9713,7 +9713,7 @@ F: security/keys/trusted-keys/ KEYS/KEYRINGS M: David Howells -M: Jarkko Sakkinen +M: Jarkko Sakkinen L: keyrings@vger.kernel.org S: Maintained F: Documentation/security/keys/core.rst @@ -17684,7 +17684,7 @@ F: drivers/platform/x86/toshiba-wmi.c TPM DEVICE DRIVER M: Peter Huewe -M: Jarkko Sakkinen +M: Jarkko Sakkinen R: Jason Gunthorpe L: linux-integrity@vger.kernel.org S: Maintained -- cgit v1.2.3-59-g8ed1b From 197d6c1dde4e19517e8cf843eec7fc6417241969 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:45 -0700 Subject: lib: bitmap: delete duplicated words Drop the repeated word "an". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040424.25760-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index 61394890788e..75006c4036e9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -23,7 +23,7 @@ /** * DOC: bitmap introduction * - * bitmaps provide an array of bits, implemented using an an + * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. -- cgit v1.2.3-59-g8ed1b From f1e594acb1bdf29b583fe205793699a8e7c35fc0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:48 -0700 Subject: lib: libcrc32c: delete duplicated words Drop the repeated word "the". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040430.25807-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/libcrc32c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 77ab839644c5..5ca0d815a95d 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -12,7 +12,7 @@ * pages = {}, * month = {June}, *} - * Used by the iSCSI driver, possibly others, and derived from the + * Used by the iSCSI driver, possibly others, and derived from * the iscsi-crc.c module of the linux-iscsi driver at * http://linux-iscsi.sourceforge.net. * -- cgit v1.2.3-59-g8ed1b From 2f22385fb1211a90f53ee197e0f41ae0d35b391c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:51 -0700 Subject: lib: decompress_bunzip2: delete duplicated words Drop the repeated word "how". 
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040436.25852-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/decompress_bunzip2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index f9628f3924ce..c72c865032fa 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -390,7 +390,7 @@ static int INIT get_next_block(struct bunzip_data *bd) j = (bd->inbufBits >> bd->inbufBitCount)& ((1 << hufGroup->maxLen)-1); got_huff_bits: - /* Figure how how many bits are in next symbol and + /* Figure how many bits are in next symbol and * unget extras */ i = hufGroup->minLen; while (j > limit[i]) -- cgit v1.2.3-59-g8ed1b From dde57fe01a0a39ce131ad5611e0a4430cec66bd8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:57 -0700 Subject: lib: dynamic_queue_limits: delete duplicated words + fix typo Drop the repeated word "the". Fix spelling of "excess". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040449.25946-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/dynamic_queue_limits.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index e659a027036e..fde0aa244148 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -60,8 +60,8 @@ void dql_completed(struct dql *dql, unsigned int count) * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * - * If there is slack, the amount of execess data queued above - * the the amount needed to prevent starvation, the queue limit + * If there is slack, the amount of excess data queued above + * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. -- cgit v1.2.3-59-g8ed1b From 4e20ace06f705404df09316b859c267e9ec99354 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:11:01 -0700 Subject: lib: earlycpio: delete duplicated words Drop the repeated word "the". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040455.25995-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/earlycpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/earlycpio.c b/lib/earlycpio.c index c001e084829e..e83628882001 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -42,7 +42,7 @@ enum cpio_fields { /** * cpio_data find_cpio_data - Search for files in an uncompressed cpio * @path: The directory to search for, including a slash at the end - * @data: Pointer to the the cpio archive or a header inside + * @data: Pointer to the cpio archive or a header inside * @len: Remaining length of the cpio based on data pointer * @nextoff: When a matching file is found, this is the offset from the * beginning of the cpio to the beginning of the next file, not the -- cgit v1.2.3-59-g8ed1b From e0656501a6192ebced36764a007f91cc014d896d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:11:04 -0700 Subject: lib: radix-tree: delete duplicated words Drop the repeated word "be". 
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040508.26086-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e4a3a4397f2..005ced92a4a9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,7 +325,7 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) int ret = -ENOMEM; /* - * Nodes preloaded by one cgroup can be be used by another cgroup, so + * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; -- cgit v1.2.3-59-g8ed1b From 408a93a2bb4f276f28059b51fbe0bbd06be350fc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:11:07 -0700 Subject: lib: syscall: delete duplicated words Drop the repeated word "the". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040514.26136-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/syscall.c b/lib/syscall.c index fb328e7ccb08..8533d2fea2d7 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -44,7 +44,7 @@ static int collect_syscall(struct task_struct *target, struct syscall_info *info * .data.instruction_pointer - filled with user PC * * If @target is blocked in a system call, returns zero with @info.data.nr - * set to the the call's number and @info.data.args filled in with its + * set to the call's number and @info.data.args filled in with its * arguments. Registers not used for system call arguments may not be available * and it is not kosher to use &struct user_regset calls while the system * call is still in progress. Note we may get this result if @target -- cgit v1.2.3-59-g8ed1b From 2d0469814ade60efb5cd8ca9af93c43bc3ba831e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:11:10 -0700 Subject: lib: test_sysctl: delete duplicated words Drop the repeated word "the". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200823040520.1999-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 98bc92a91662..3750323973f4 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -16,7 +16,7 @@ */ /* - * This module provides an interface to the the proc sysctl interfaces. This + * This module provides an interface to the proc sysctl interfaces. This * driver requires CONFIG_PROC_SYSCTL. It will not normally be loaded by the * system unless explicitly requested by name. You can also build this driver * into your kernel. -- cgit v1.2.3-59-g8ed1b From 8d8472cfdefa0f5c839f5c4a507af5e26ee6f5bf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:11:14 -0700 Subject: lib/mpi/mpi-bit.c: fix spello of "functions" Fix typo/spello of "functions". 
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/8df15173-a6df-9426-7cad-a2d279bf1170@infradead.org Signed-off-by: Linus Torvalds --- lib/mpi/mpi-bit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mpi/mpi-bit.c b/lib/mpi/mpi-bit.c index a5119a2bcdd4..142b680835df 100644 --- a/lib/mpi/mpi-bit.c +++ b/lib/mpi/mpi-bit.c @@ -1,4 +1,4 @@ -/* mpi-bit.c - MPI bit level fucntions +/* mpi-bit.c - MPI bit level functions * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. -- cgit v1.2.3-59-g8ed1b From 3b6742618ed9216dd6caad968fe8c83b32dff485 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 15 Oct 2020 20:11:17 -0700 Subject: lib/idr.c: document calling context for IDA APIs mustn't use locks The documentation for these functions indicates that callers don't need to hold a lock while calling them, but that documentation is only in one place under "IDA Usage". Let's state the same information on each IDA function so that it's clear what the calling context requires. Furthermore, let's document ida_simple_get() with the same information so that callers know how this API works. Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Reviewed-by: Greg Kroah-Hartman Cc: Tri Vo Cc: Jonathan Corbet Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20200910055246.2297797-1-swboyd@chromium.org Signed-off-by: Linus Torvalds --- include/linux/idr.h | 9 ++++++--- lib/idr.c | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ade03e5c7af..b235ed987021 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -263,7 +263,8 @@ void ida_destroy(struct ida *ida); * * Allocate an ID between 0 and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -280,7 +281,8 @@ static inline int ida_alloc(struct ida *ida, gfp_t gfp) * * Allocate an ID between @min and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -297,7 +299,8 @@ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) * * Allocate an ID between 0 and @max, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ diff --git a/lib/idr.c b/lib/idr.c index c2cf2c52bbde..3fa8be43696f 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -372,7 +372,8 @@ EXPORT_SYMBOL(idr_replace); * Allocate an ID between @min and @max, inclusive. The allocated ID will * not exceed %INT_MAX, even if @max is larger. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -479,7 +480,8 @@ EXPORT_SYMBOL(ida_alloc_range); * @ida: IDA handle. * @id: Previously allocated ID. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. 
*/ void ida_free(struct ida *ida, unsigned int id) { @@ -531,7 +533,8 @@ EXPORT_SYMBOL(ida_free); * or freed. If the IDA is already empty, there is no need to call this * function. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. */ void ida_destroy(struct ida *ida) { -- cgit v1.2.3-59-g8ed1b From 3264ceec8f17a99a3895de7de06b4d7e9c8f3f30 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 15 Oct 2020 20:11:21 -0700 Subject: lib/idr.c: document that ida_simple_{get,remove}() are deprecated These two functions are deprecated. Users should call ida_alloc() or ida_free() respectively instead. Add documentation to this effect until the macro can be removed. Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Reviewed-by: Tri Vo Cc: Greg KH Cc: Jonathan Corbet Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20200910055246.2297797-2-swboyd@chromium.org Signed-off-by: Linus Torvalds --- include/linux/idr.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/idr.h b/include/linux/idr.h index b235ed987021..a0dce14090a9 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -314,6 +314,10 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } +/* + * ida_simple_get() and ida_simple_remove() are deprecated. Use + * ida_alloc() and ida_free() instead respectively. + */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) -- cgit v1.2.3-59-g8ed1b From 6ed9b92e290b530197c2dda2271f5312abc475e6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 15 Oct 2020 20:11:25 -0700 Subject: lib/scatterlist.c: avoid a double memset 'sgl' is zeroed a few lines below in 'sg_init_table()'. There is no need to clear it twice. Remove the redundant initialization. Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200920071544.368841-1-christophe.jaillet@wanadoo.fr Signed-off-by: Linus Torvalds --- lib/scatterlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 5d63a8857f36..d94628fa3349 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -504,7 +504,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, nalloc++; } sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), - (gfp & ~GFP_DMA) | __GFP_ZERO); + gfp & ~GFP_DMA); if (!sgl) return NULL; -- cgit v1.2.3-59-g8ed1b From 1d339638a954edce06ffcaa15d883d322e9e8291 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 15 Oct 2020 20:11:28 -0700 Subject: lib/percpu_counter.c: use helper macro abs() Use helper macro abs() to simplify the "x >= t || x <= -t" cmp. 
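The two forms are equivalent for any positive batch. A minimal user-space sketch of that identity (illustration only, not kernel code; llabs() stands in for the kernel's type-generic abs(), and the values and batch size are made up):

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		long long counts[] = { -33, -32, -31, 0, 31, 32, 33 };
		int batch = 32;	/* stand-in for percpu_counter_batch */
		int i;

		for (i = 0; i < (int)(sizeof(counts) / sizeof(counts[0])); i++) {
			long long c = counts[i];
			int old_test = (c >= batch || c <= -batch);
			int new_test = (llabs(c) >= batch);

			/* both tests fire exactly when |count| reaches batch */
			printf("count=%lld old=%d new=%d\n", c, old_test, new_test);
		}
		return 0;
	}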
Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: https://lkml.kernel.org/r/20200927122746.5964-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f61689a96e85..00f666d94486 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -85,7 +85,7 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; - if (count >= batch || count <= -batch) { + if (abs(count) >= batch) { unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; -- cgit v1.2.3-59-g8ed1b From e130816164e244b692921de49771eeb28205152d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:11:31 -0700 Subject: include/linux/list.h: add a macro to test if entry is pointing to the head Add a macro to test if entry is pointing to the head of the list which is useful in cases like: list_for_each_entry(pos, &head, member) { if (cond) break; } if (list_entry_is_head(pos, &head, member)) return -ERRNO; that allows to avoid additional variable to be added to track if loop has not been stopped in the middle. While here, convert list_for_each_entry*() family of macros to use a new one. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Reviewed-by: Cezary Rojewski Link: https://lkml.kernel.org/r/20200929134342.51489-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Torvalds --- include/linux/list.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 0d0d17a10d25..a18c87b63376 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -609,6 +609,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos != (head); \ pos = n, n = pos->prev) +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. 
@@ -617,7 +626,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -628,7 +637,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -653,7 +662,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -667,7 +676,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -692,7 +701,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -705,7 +714,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -721,7 +730,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -736,7 +745,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -752,7 +761,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** -- cgit v1.2.3-59-g8ed1b From f3c9d0a3fe97a8b05548d27e9a8e7a5e6875004c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 15 Oct 2020 20:11:34 -0700 Subject: lib/test_hmm.c: fix an error code in dmirror_allocate_chunk() This is supposed to return false 
on failure, not a negative error code. Fixes: 170e38548b81 ("mm/hmm/test: use after free in dmirror_allocate_chunk()") Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Reviewed-by: Ralph Campbell Cc: Jerome Glisse Cc: Stephen Rothwell Cc: Jason Gunthorpe Cc: Dan Williams Link: https://lkml.kernel.org/r/20201010200812.GA1886610@mwanda Signed-off-by: Linus Torvalds --- lib/test_hmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e151a7f10519..80a78877bd93 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -461,7 +461,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); if (!devmem) - return -ENOMEM; + return false; res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, "hmm_dmirror"); -- cgit v1.2.3-59-g8ed1b From 904542dc56524f921a6bab0639ff6249c01e775f Mon Sep 17 00:00:00 2001 From: Tobias Jordan Date: Thu, 15 Oct 2020 20:11:38 -0700 Subject: lib/crc32.c: fix trivial typo in preprocessor condition Whether crc32_be needs a lookup table is chosen based on CRC_LE_BITS. Obviously, the _be function should be governed by the _BE_ define. This probably never pops up as it's hard to come up with a configuration where CRC_BE_BITS isn't the same as CRC_LE_BITS and as nobody is using bitwise CRC anyway. Fixes: 46c5801eaf86 ("crc32: bolt on crc32c") Signed-off-by: Tobias Jordan Signed-off-by: Andrew Morton Cc: Krzysztof Kozlowski Cc: Jonathan Corbet Cc: Mauro Carvalho Chehab Link: https://lkml.kernel.org/r/20200923182122.GA3338@agrajag.zerfleddert.de Signed-off-by: Linus Torvalds --- lib/crc32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/crc32.c b/lib/crc32.c index 35a03d03f973..2a68dfd3b96c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -331,7 +331,7 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, return crc; } -#if CRC_LE_BITS == 1 +#if CRC_BE_BITS == 1 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE); -- cgit v1.2.3-59-g8ed1b From a9eb63705e379f10a3c9d13fc6aee8b50805e862 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:11:41 -0700 Subject: bitops: simplify get_count_order_long() These two cases could be unified into one. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Andy Shevchenko Link: https://lkml.kernel.org/r/20200807085837.11697-2-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 99f2ac30b1d9..030a98f0c452 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -206,10 +206,7 @@ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; - else if (l & (l - 1UL)) - return (int)fls_long(l); - else - return (int)fls_long(l) - 1; + return (int)fls_long(--l); } /** -- cgit v1.2.3-59-g8ed1b From 004fba1ae6ddd66ba0faa4f60c603b3ca77b3554 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:11:46 -0700 Subject: bitops: use the same mechanism for get_count_order[_long] These two functions share the same logic. 
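The key observation is that decrementing first makes the power-of-two and non-power-of-two branches coincide. A quick user-space check of that identity (illustration only; fls_long() is emulated here with GCC's __builtin_clzl, which the kernel code does not do):

	#include <stdio.h>

	/* user-space stand-in for the kernel's fls_long() */
	static int fls_long(unsigned long l)
	{
		return l ? (int)(8 * sizeof(l)) - __builtin_clzl(l) : 0;
	}

	int main(void)
	{
		unsigned long l;

		for (l = 1; l <= 10; l++) {
			/* old: branch on whether l is a power of two */
			int old = (l & (l - 1UL)) ? fls_long(l) : fls_long(l) - 1;
			/* new: unconditionally fls_long(l - 1) */
			int new = fls_long(l - 1);

			printf("l=%lu old=%d new=%d\n", l, old, new);
		}
		return 0;
	}

For l = 8 both forms give 3, for l = 9 both give 4; the l == 0 case keeps its explicit -1 return in either version.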
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Andy Shevchenko Link: https://lkml.kernel.org/r/20200807085837.11697-3-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 030a98f0c452..5b74bdf159d6 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -188,12 +188,10 @@ static inline unsigned fls_long(unsigned long l) static inline int get_count_order(unsigned int count) { - int order; + if (count == 0) + return -1; - order = fls(count) - 1; - if (count & (count - 1)) - order++; - return order; + return fls(--count); } /** -- cgit v1.2.3-59-g8ed1b From 3e89ad8506f39c4739a6c9ca1e1552f506f000c9 Mon Sep 17 00:00:00 2001 From: Jerome Forissier Date: Thu, 15 Oct 2020 20:11:49 -0700 Subject: checkpatch: add --kconfig-prefix Kconfig allows to customize the CONFIG_ prefix via the $CONFIG_ environment variable. Out-of-tree projects may therefore use Kconfig with a different prefix, or they may use a custom configuration tool which does not use the CONFIG_ prefix at all. Such projects may still want to adhere to the Linux kernel coding style and run checkpatch.pl. One example is OP-TEE [1] which does not use Kconfig but does have configuration options prefixed with CFG_. It also mostly follows the kernel coding style and therefore being able to use checkpatch is quite valuable. To make this possible, add the --kconfig-prefix command line option. [1] https://github.com/OP-TEE/optee_os Signed-off-by: Jerome Forissier Signed-off-by: Andrew Morton Acked-by: Joe Perches Link: http://lkml.kernel.org/r/20200818081732.800449-1-jerome@forissier.org Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 504d2e431c60..9998340d69c6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -65,6 +65,7 @@ my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANC # git output parsing needs US English output, so first set backtick child process LANGUAGE my $git_command ='export LANGUAGE=en_US.UTF-8; git'; my $tabsize = 8; +my ${CONFIG_} = "CONFIG_"; sub help { my ($exitcode) = @_; @@ -127,6 +128,8 @@ Options: --typedefsfile Read additional types from this file --color[=WHEN] Use colors 'always', 'never', or only when output is a terminal ('auto'). Default is 'auto'. + --kconfig-prefix=WORD use WORD as a prefix for Kconfig symbols (default + ${CONFIG_}) -h, --help, --version display this help and exit When FILE is - read standard input. @@ -235,6 +238,7 @@ GetOptions( 'color=s' => \$color, 'no-color' => \$color, #keep old behaviors of -nocolor 'nocolor' => \$color, #keep old behaviors of -nocolor + 'kconfig-prefix=s' => \${CONFIG_}, 'h|help' => \$help, 'version' => \$help ) or help(1); @@ -6524,16 +6528,16 @@ sub process { } # check for IS_ENABLED() without CONFIG_ ($rawline for comments too) - if ($rawline =~ /\bIS_ENABLED\s*\(\s*(\w+)\s*\)/ && $1 !~ /^CONFIG_/) { + if ($rawline =~ /\bIS_ENABLED\s*\(\s*(\w+)\s*\)/ && $1 !~ /^${CONFIG_}/) { WARN("IS_ENABLED_CONFIG", - "IS_ENABLED($1) is normally used as IS_ENABLED(CONFIG_$1)\n" . $herecurr); + "IS_ENABLED($1) is normally used as IS_ENABLED(${CONFIG_}$1)\n" . 
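A worked example (not from the original changelog): get_count_order() returns the order of the smallest power of two that is >= count, so get_count_order(16) must be 4 and get_count_order(17) must be 5. The rewrite computes these as fls(15) = 4 and fls(16) = 5, matching the old branchy version while mirroring get_count_order_long() exactly.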
$herecurr); } # check for #if defined CONFIG_ || defined CONFIG__MODULE - if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(CONFIG_[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) { + if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(${CONFIG_}[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) { my $config = $1; if (WARN("PREFER_IS_ENABLED", - "Prefer IS_ENABLED() to CONFIG_ || CONFIG__MODULE\n" . $herecurr) && + "Prefer IS_ENABLED() to ${CONFIG_} || ${CONFIG_}_MODULE\n" . $herecurr) && $fix) { $fixed[$fixlinenr] = "\+#if IS_ENABLED($config)"; } -- cgit v1.2.3-59-g8ed1b From 310cd06ba2496002481eb7fd3b02c51d115aa115 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:11:52 -0700 Subject: checkpatch: move repeated word test Currently this test only works on .[ch] files. Move the test to check more file types and the commit log. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/180b3b5677771c902b2e2f7a2b7090ede65fe004.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 72 +++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9998340d69c6..268028e382b5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2991,6 +2991,42 @@ sub process { } } +# check for repeated words separated by a single space + if ($rawline =~ /^\+/ || $in_commit_log) { + while ($rawline =~ /\b($word_pattern) (?=($word_pattern))/g) { + + my $first = $1; + my $second = $2; + + if ($first =~ /(?:struct|union|enum)/) { + pos($rawline) += length($first) + length($second) + 1; + next; + } + + next if ($first ne $second); + next if ($first eq 'long'); + + if (WARN("REPEATED_WORD", + "Possible repeated word: '$first'\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b$first $second\b/$first/; + } + } + + # if it's a repeated word on consecutive lines in a comment block + if ($prevline =~ /$;+\s*$/ && + $prevrawline =~ /($word_pattern)\s*$/) { + my $last_word = $1; + if ($rawline =~ /^\+\s*\*\s*$last_word /) { + if (WARN("REPEATED_WORD", + "Possible repeated word: '$last_word'\n" . $hereprev) && + $fix) { + $fixed[$fixlinenr] =~ s/(\+\s*\*\s*)$last_word /$1/; + } + } + } + } + # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); @@ -3314,42 +3350,6 @@ sub process { } } -# check for repeated words separated by a single space - if ($rawline =~ /^\+/) { - while ($rawline =~ /\b($word_pattern) (?=($word_pattern))/g) { - - my $first = $1; - my $second = $2; - - if ($first =~ /(?:struct|union|enum)/) { - pos($rawline) += length($first) + length($second) + 1; - next; - } - - next if ($first ne $second); - next if ($first eq 'long'); - - if (WARN("REPEATED_WORD", - "Possible repeated word: '$first'\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/\b$first $second\b/$first/; - } - } - - # if it's a repeated word on consecutive lines in a comment block - if ($prevline =~ /$;+\s*$/ && - $prevrawline =~ /($word_pattern)\s*$/) { - my $last_word = $1; - if ($rawline =~ /^\+\s*\*\s*$last_word /) { - if (WARN("REPEATED_WORD", - "Possible repeated word: '$last_word'\n" . $hereprev) && - $fix) { - $fixed[$fixlinenr] =~ s/(\+\s*\*\s*)$last_word /$1/; - } - } - } - } - # check for space before tabs. if ($rawline =~ /^\+/ && $rawline =~ / \t/) { my $herevet = "$here\n" . cat_vet($rawline) . 
"\n"; -- cgit v1.2.3-59-g8ed1b From 40873aba2c6b96c6f21265e6508eaa3de145e30a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:11:56 -0700 Subject: checkpatch: add test for comma use that should be semicolon There are commas used as statement terminations that should typically have used semicolons instead. Only direct assignments or use of a single function or value on a single line are detected by this test. e.g.: foo = bar(), /* typical use is semicolon not comma */ bar = baz(); Add an imperfect test to detect these comma uses. No false positives were found in testing, but many types of false negatives are possible. e.g.: foo = bar() + 1, /* comma use, but not direct assignment */ bar = baz(); Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/3bf27caf462007dfa75647b040ab3191374a59de.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 268028e382b5..74ccd122fc07 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4940,6 +4940,17 @@ sub process { } } +# check if a statement with a comma should be two statements like: +# foo = bar(), /* comma should be semicolon */ +# bar = baz(); + if (defined($stat) && + $stat =~ /^\+\s*(?:$Lval\s*$Assignment\s*)?$FuncArg\s*,\s*(?:$Lval\s*$Assignment\s*)?$FuncArg\s*;\s*$/) { + my $cnt = statement_rawlines($stat); + my $herectx = get_stat_here($linenr, $cnt, $here); + WARN("SUSPECT_COMMA_SEMICOLON", + "Possible comma where semicolon could be used\n" . $herectx); + } + # return is not a function if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) { my $spacing = $1; -- cgit v1.2.3-59-g8ed1b From ed4761f7804743e73ba2c0c140535a16d07032b6 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Thu, 15 Oct 2020 20:11:59 -0700 Subject: const_structs.checkpatch: add phy_ops All usages of phy_ops in include/linux uses const phy_ops * and all instances of phy_ops in the kernel that are not const already can be made const (patches have been posted for those separately). Suggested-by: Joe Perches Signed-off-by: Rikard Falkeborn Signed-off-by: Andrew Morton Cc: Kishon Vijay Abraham I Cc: Vinod Koul Link: https://lkml.kernel.org/r/20200824214132.9072-1-rikard.falkeborn@gmail.com Signed-off-by: Linus Torvalds --- scripts/const_structs.checkpatch | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index e9df9cc28a85..cd45cb3c2b04 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -39,6 +39,7 @@ nlmsvc_binding nvkm_device_chip of_device_id pci_raw_ops +phy_ops pipe_buf_operations platform_hibernation_ops platform_suspend_ops -- cgit v1.2.3-59-g8ed1b From 8020b253631286807ca41f176e96e18af14bfb2e Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Thu, 15 Oct 2020 20:12:02 -0700 Subject: checkpatch: warn if trace_printk and friends are called trace_printk is meant as a debugging tool, and should not be compiled into production code without specific debug Kconfig options enabled, or source code changes, as indicated by the warning that shows up on boot if any trace_printk is called: ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** ** ** ** trace_printk() being used. Allocating extra memory. ** ** ** ** This means that this is a DEBUG kernel and it is ** ** unsafe for production use. ** Let's warn developers when they try to submit such a change. 
Signed-off-by: Nicolas Boichat Signed-off-by: Andrew Morton Cc: Steven Rostedt Cc: Joe Perches Link: https://lkml.kernel.org/r/20200825193600.v2.1.I723c43c155f02f726c97501be77984f1e6bb740a@changeid Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 74ccd122fc07..81c20557d974 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4274,6 +4274,12 @@ sub process { "Prefer dev_$level(... to dev_printk(KERN_$orig, ...\n" . $herecurr); } +# trace_printk should not be used in production code. + if ($line =~ /\b(trace_printk|trace_puts|ftrace_vprintk)\s*\(/) { + WARN("TRACE_PRINTK", + "Do not use $1() in production code (this can be ignored if built only with a debug config option)\n" . $herecurr); + } + # ENOSYS means "bad syscall nr" and nothing else. This will have a small # number of false positives, but assembly files are not checked, so at # least the arch entry code will not trigger this warning. -- cgit v1.2.3-59-g8ed1b From c12093a11462b775a83ed600bf0036c8bef3e841 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Thu, 15 Oct 2020 20:12:05 -0700 Subject: const_structs.checkpatch: add pinctrl_ops and pinmux_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All usages of include/linux of these are const pointers, and all instances in the kernel except one, that are not const can be made const (patches have been posted for those separately). Signed-off-by: Rikard Falkeborn Signed-off-by: Andrew Morton Reviewed-by: Linus Walleij Acked-by: Manivannan Sadhasivam Cc: Joe Perches Cc: Andreas Färber Cc: Rikard Falkeborn Link: https://lkml.kernel.org/r/20200830224352.37114-1-rikard.falkeborn@gmail.com Signed-off-by: Linus Torvalds --- scripts/const_structs.checkpatch | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index cd45cb3c2b04..1aae4f4fdacc 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -40,6 +40,8 @@ nvkm_device_chip of_device_id pci_raw_ops phy_ops +pinctrl_ops +pinmux_ops pipe_buf_operations platform_hibernation_ops platform_suspend_ops -- cgit v1.2.3-59-g8ed1b From 99ca38c2aa7da5ab39e55f13fd425c6101903ccb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:12:09 -0700 Subject: checkpatch: warn on self-assignments The uninitialized_var() macro was removed recently via commit 63a0895d960a ("compiler: Remove uninitialized_var() macro") as it's not a particularly useful warning and its use can "paper over real bugs". Add a checkpatch test to warn on self-assignments as a means to avoid compiler warnings and as a back-door mechanism to reproduce the old uninitialized_var macro behavior. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Cc: Kees Cook Cc: Gustavo A. R. 
Silva Cc: Denis Efremov Cc: Julia Lawall Link: https://lkml.kernel.org/r/afc2cffdd315d3e4394af149278df9e8af7f49f4.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 81c20557d974..5fe8762b1fae 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3899,6 +3899,17 @@ sub process { #ignore lines not being added next if ($line =~ /^[^\+]/); +# check for self assignments used to avoid compiler warnings +# e.g.: int foo = foo, *bar = NULL; +# struct foo bar = *(&(bar)); + if ($line =~ /^\+\s*(?:$Declare)?([A-Za-z_][A-Za-z\d_]*)\s*=/) { + my $var = $1; + if ($line =~ /^\+\s*(?:$Declare)?$var\s*=\s*(?:$var|\*\s*\(?\s*&\s*\(?\s*$var\s*\)?\s*\)?)\s*[;,]/) { + WARN("SELF_ASSIGNMENT", + "Do not use self-assignments to avoid compiler warnings\n" . $herecurr); + } + } + # check for dereferences that span multiple lines if ($prevline =~ /^\+.*$Lval\s*(?:\.|->)\s*$/ && $line =~ /^\+\s*(?!\#\s*(?!define\s+|if))\s*$Lval/) { -- cgit v1.2.3-59-g8ed1b From f5f613259f3fea813c3f698a426c78eac61d8601 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:12:12 -0700 Subject: checkpatch: allow not using -f with files that are in git If a file exists in git and checkpatch is used without the -f flag for scanning a file, then checkpatch will scan the file assuming it's a patch and emit: ERROR: Does not appear to be a unified-diff format patch Change the behavior to assume the -f flag if the file exists in git. [joe@perches.com: fix git "fatal" warning if file argument outside kernel tree] Link: https://lkml.kernel.org/r/b6afa04112d450c2fc120a308d706acd60cee294.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Reviewed-by: Julia Lawall Cc: Rasmus Villemoes Link: https://lkml.kernel.org/r/45b81a48e1568bd0126a96f5046eb7aaae9b83c9.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5fe8762b1fae..360ae4b84ee3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -974,6 +974,16 @@ sub seed_camelcase_includes { } } +sub git_is_single_file { + my ($filename) = @_; + + return 0 if ((which("git") eq "") || !(-e "$gitroot")); + + my $output = `${git_command} ls-files -- $filename 2>/dev/null`; + my $count = $output =~ tr/\n//; + return $count eq 1 && $output =~ m{^${filename}$}; +} + sub git_commit_info { my ($commit, $id, $desc) = @_; @@ -1047,6 +1057,9 @@ my $vname; $allow_c99_comments = !defined $ignore_type{"C99_COMMENT_TOLERANCE"}; for my $filename (@ARGV) { my $FILE; + my $is_git_file = git_is_single_file($filename); + my $oldfile = $file; + $file = 1 if ($is_git_file); if ($git) { open($FILE, '-|', "git format-patch -M --stdout -1 $filename") || die "$P: $filename: git format-patch failed - $!\n"; @@ -1091,6 +1104,7 @@ for my $filename (@ARGV) { @modifierListFile = (); @typeListFile = (); build_types(); + $file = $oldfile if ($is_git_file); } if (!$quiet) { -- cgit v1.2.3-59-g8ed1b From e7f929f3ca9ecadb7d38340345920af49e09eb6a Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Thu, 15 Oct 2020 20:12:15 -0700 Subject: checkpatch: extend author Signed-off-by check for split From: header Checkpatch did not handle cases where the author From: header was split into multiple lines. The author identity could not be resolved and checkpatch generated a false NO_AUTHOR_SIGN_OFF warning. 
A typical example is commit e33bcbab16d1 ("tee: add support for session's client UUID generation"). When checkpatch was run on this commit, it displayed: "WARNING:NO_AUTHOR_SIGN_OFF: Missing Signed-off-by: line by nominal patch author ''" This was due to split header lines not being handled properly and the author himself wrote in commit cd2614967d8b ("checkpatch: warn if missing author Signed-off-by"): "Split From: headers are not fully handled: only the first part is compared." Support split From: headers by correctly parsing the header extension lines. RFC 5322, Section-2.2.3 stated that each extended line must start with a WSP character (a space or htab). The solution was therefore to concatenate the lines which start with a WSP to get the correct long header. Suggested-by: Joe Perches Signed-off-by: Dwaipayan Ray Signed-off-by: Andrew Morton Tested-by: Lukas Bulwahn Reviewed-by: Lukas Bulwahn Acked-by: Joe Perches Link: https://lore.kernel.org/linux-kernel-mentees/f5d8124e54a50480b0a9fa638787bc29b6e09854.camel@perches.com/ Link: https://lkml.kernel.org/r/20200921085436.63003-1-dwaipayanray1@gmail.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 360ae4b84ee3..f40a81f24d43 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2679,6 +2679,10 @@ sub process { # Check the patch for a From: if (decode("MIME-Header", $line) =~ /^From:\s*(.*)/) { $author = $1; + my $curline = $linenr; + while(defined($rawlines[$curline]) && ($rawlines[$curline++] =~ /^[ \t]\s*(.*)/)) { + $author .= $1; + } $author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i); $author =~ s/"//g; $author = reformat_email($author); -- cgit v1.2.3-59-g8ed1b From a0154cdbd3dcf2b1b92f42cb1351a63f3f947e80 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 15 Oct 2020 20:12:19 -0700 Subject: checkpatch: emit a warning on embedded filenames Embedding the complete filename path inside the file isn't particularly useful as often the path is moved around and becomes incorrect. Emit a warning when the source contains the filename. [akpm@linux-foundation.org: remove stray " di"] Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/1fd5f9188a14acdca703ca00301ee323de672a8d.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f40a81f24d43..5424134e201d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3271,6 +3271,12 @@ sub process { } } +# check for embedded filenames + if ($rawline =~ /^\+.*\Q$realfile\E/) { + WARN("EMBEDDED_FILENAME", + "It's generally not useful to have the filename in the file\n" . $herecurr); + } + # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); -- cgit v1.2.3-59-g8ed1b From 2e44e8033a9bc0420f315c32843ab89c0fc7bd0f Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Thu, 15 Oct 2020 20:12:22 -0700 Subject: checkpatch: fix multi-statement macro checks for while blocks. Checkpatch.pl doesn't have a check for excluding while (...) {...} blocks from MULTISTATEMENT_MACRO_USE_DO_WHILE error. 
For example, running checkpatch.pl on the file mm/maccess.c in the kernel generates the following error: ERROR: Macros with complex values should be enclosed in parentheses +#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ + while (len >= sizeof(type)) { \ + __get_kernel_nofault(dst, src, type, err_label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } The error is misleading for this case. Enclosing it in parentheses doesn't make any sense. Checkpatch already has an exception list for such common macro types. Added a new exception for while (...) {...} style blocks to the same. In addition, the brace flatten logic was modified by changing the substitution characters from "1" to "1u". This was done to ensure that macros in the form "#define foo(bar) while(bar){bar--;}" were also correctly processed. Link: https://lore.kernel.org/linux-kernel-mentees/dc985938aa3986702815a0bd68dfca8a03c85447.camel@perches.com/ Suggested-by: Joe Perches Signed-off-by: Dwaipayan Ray Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20201001171903.312021-1-dwaipayanray1@gmail.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5424134e201d..8a5f0367a2f1 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5351,9 +5351,9 @@ sub process { $dstat =~ s/\s*$//s; # Flatten any parentheses and braces - while ($dstat =~ s/\([^\(\)]*\)/1/ || - $dstat =~ s/\{[^\{\}]*\}/1/ || - $dstat =~ s/.\[[^\[\]]*\]/1/) + while ($dstat =~ s/\([^\(\)]*\)/1u/ || + $dstat =~ s/\{[^\{\}]*\}/1u/ || + $dstat =~ s/.\[[^\[\]]*\]/1u/) { } @@ -5394,6 +5394,7 @@ sub process { $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...) + $dstat !~ /^while\s*$Constant\s*$Constant\s*$/ && # while (...) {...} $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... -- cgit v1.2.3-59-g8ed1b From c70735c23bf6cbfc9d7dad5f4ad562f0e40a89fa Mon Sep 17 00:00:00 2001 From: Łukasz Stelmach Date: Thu, 15 Oct 2020 20:12:25 -0700 Subject: checkpatch: fix false positive on empty block comment lines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
+/* + * INET An implementation of the TCP/IP protocol suite for the LINUX Signed-off-by: Łukasz Stelmach Signed-off-by: Andrew Morton Acked-by: Joe Perches Cc: Bartłomiej Żolnierkiewicz Cc: Marek Szyprowski Link: https://lkml.kernel.org/r/20201006083509.19934-1-l.stelmach@samsung.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8a5f0367a2f1..e1ddcafdc176 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3464,7 +3464,7 @@ sub process { if ($realfile =~ m@^(drivers/net/|net/)@ && $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ && $rawline =~ /^\+[ \t]*\*/ && - $realline > 2) { + $realline > 3) { # Do not warn about the initial copyright comment block after SPDX-License-Identifier WARN("NETWORKING_BLOCK_COMMENT_STYLE", "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev); } -- cgit v1.2.3-59-g8ed1b From 48ca2d8ac8a1d404a0f85eabfe1546304adbd844 Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Thu, 15 Oct 2020 20:12:28 -0700 Subject: checkpatch: add new warnings to author signoff checks. The author signed-off-by checks are currently very vague. Cases like same name or same address are not handled separately. For example, running checkpatch on commit be6577af0cef ("parisc: Add atomic64_set_release() define to avoid CPU soft lockups"), gives: WARNING: Missing Signed-off-by: line by nominal patch author 'John David Anglin ' The signoff line was: "Signed-off-by: Dave Anglin " Clearly the author has signed off but with a slightly different version of his name. A more appropriate warning would have been to point out at the name mismatch instead. Previously, the values assumed by $authorsignoff were either 0 or 1 to indicate whether a proper sign off by author is present. Extended the checks to handle four new cases. $authorsignoff values now denote the following: 0: Missing sign off by patch author. 1: Sign off present and identical. 2: Addresses and names match, but comments differ. "James Watson(JW) ", "James Watson " 3: Addresses match, but names are different. "James Watson ", "James " 4: Names match, but addresses are different. "James Watson ", "James Watson " 5: Names match, addresses excluding subaddress details (RFC 5233) match. "James Watson ", "James Watson " Also introduced a new message type FROM_SIGN_OFF_MISMATCH for cases 2, 3, 4 and 5. 
Suggested-by: Joe Perches Signed-off-by: Dwaipayan Ray Signed-off-by: Andrew Morton Acked-by: Joe Perches Link: https://lore.kernel.org/linux-kernel-mentees/c1ca28e77e8e3bfa7aadf3efa8ed70f97a9d369c.camel@perches.com/ Link: https://lkml.kernel.org/r/20201007192029.551744-1-dwaipayanray1@gmail.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 93 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e1ddcafdc176..4223a9ac7059 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1181,10 +1181,10 @@ sub parse_email { } } + $comment = trim($comment); $name = trim($name); $name =~ s/^\"|\"$//g; - $name =~ s/(\s*\([^\)]+\))\s*//; - if (defined($1)) { + if ($name =~ s/(\s*\([^\)]+\))\s*//) { $name_comment = trim($1); } $address = trim($address); @@ -1199,10 +1199,12 @@ sub parse_email { } sub format_email { - my ($name, $address) = @_; + my ($name, $name_comment, $address, $comment) = @_; my $formatted_email; + $name_comment = trim($name_comment); + $comment = trim($comment); $name = trim($name); $name =~ s/^\"|\"$//g; $address = trim($address); @@ -1215,9 +1217,9 @@ sub format_email { if ("$name" eq "") { $formatted_email = "$address"; } else { - $formatted_email = "$name <$address>"; + $formatted_email = "$name$name_comment <$address>"; } - + $formatted_email .= "$comment"; return $formatted_email; } @@ -1225,17 +1227,23 @@ sub reformat_email { my ($email) = @_; my ($email_name, $name_comment, $email_address, $comment) = parse_email($email); - return format_email($email_name, $email_address); + return format_email($email_name, $name_comment, $email_address, $comment); } sub same_email_addresses { - my ($email1, $email2) = @_; + my ($email1, $email2, $match_comment) = @_; my ($email1_name, $name1_comment, $email1_address, $comment1) = parse_email($email1); my ($email2_name, $name2_comment, $email2_address, $comment2) = parse_email($email2); + if ($match_comment != 1) { + return $email1_name eq $email2_name && + $email1_address eq $email2_address; + } return $email1_name eq $email2_name && - $email1_address eq $email2_address; + $email1_address eq $email2_address && + $name1_comment eq $name2_comment && + $comment1 eq $comment2; } sub which { @@ -2365,6 +2373,7 @@ sub process { my $signoff = 0; my $author = ''; my $authorsignoff = 0; + my $author_sob = ''; my $is_patch = 0; my $is_binding_patch = -1; my $in_header_lines = $file ? 
0 : 1; @@ -2692,9 +2701,37 @@ sub process { if ($line =~ /^\s*signed-off-by:\s*(.*)/i) { $signoff++; $in_commit_log = 0; - if ($author ne '') { - if (same_email_addresses($1, $author)) { + if ($author ne '' && $authorsignoff != 1) { + if (same_email_addresses($1, $author, 1)) { $authorsignoff = 1; + } else { + my $ctx = $1; + my ($email_name, $email_comment, $email_address, $comment1) = parse_email($ctx); + my ($author_name, $author_comment, $author_address, $comment2) = parse_email($author); + + if ($email_address eq $author_address && $email_name eq $author_name) { + $author_sob = $ctx; + $authorsignoff = 2; + } elsif ($email_address eq $author_address) { + $author_sob = $ctx; + $authorsignoff = 3; + } elsif ($email_name eq $author_name) { + $author_sob = $ctx; + $authorsignoff = 4; + + my $address1 = $email_address; + my $address2 = $author_address; + + if ($address1 =~ /(\S+)\+\S+(\@.*)/) { + $address1 = "$1$2"; + } + if ($address2 =~ /(\S+)\+\S+(\@.*)/) { + $address2 = "$1$2"; + } + if ($address1 eq $address2) { + $authorsignoff = 5; + } + } } } } @@ -2751,7 +2788,7 @@ sub process { } my ($email_name, $name_comment, $email_address, $comment) = parse_email($email); - my $suggested_email = format_email(($email_name, $email_address)); + my $suggested_email = format_email(($email_name, $name_comment, $email_address, $comment)); if ($suggested_email eq "") { ERROR("BAD_SIGN_OFF", "Unrecognized email address: '$email'\n" . $herecurr); @@ -2761,9 +2798,9 @@ sub process { $dequoted =~ s/" missing sign off + # 1 -> sign off identical + # 2 -> names and addresses match, comments mismatch + # 3 -> addresses match, names different + # 4 -> names match, addresses different + # 5 -> names match, addresses excluding subaddress details (refer RFC 5233) match + + my $sob_msg = "'From: $author' != 'Signed-off-by: $author_sob'"; + + if ($authorsignoff == 0) { + ERROR("NO_AUTHOR_SIGN_OFF", + "Missing Signed-off-by: line by nominal patch author '$author'\n"); + } elsif ($authorsignoff == 2) { + CHK("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email comments mismatch: $sob_msg\n"); + } elsif ($authorsignoff == 3) { + WARN("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email name mismatch: $sob_msg\n"); + } elsif ($authorsignoff == 4) { + WARN("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email address mismatch: $sob_msg\n"); + } elsif ($authorsignoff == 5) { + WARN("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email subaddress mismatch: $sob_msg\n"); + } } } -- cgit v1.2.3-59-g8ed1b From ce81bb256a224259ab686742a6284930cbe4f1fa Mon Sep 17 00:00:00 2001 From: Chris Kennelly Date: Thu, 15 Oct 2020 20:12:32 -0700 Subject: fs/binfmt_elf: use PT_LOAD p_align values for suitable start address Patch series "Selecting Load Addresses According to p_align", v3. The current ELF loading mechancism provides page-aligned mappings. This can lead to the program being loaded in a way unsuitable for file-backed, transparent huge pages when handling PIE executables. While specifying -z,max-page-size=0x200000 to the linker will generate suitably aligned segments for huge pages on x86_64, the executable needs to be loaded at a suitably aligned address as well. This alignment requires the binary's cooperation, as distinct segments need to be appropriately paddded to be eligible for THP. For binaries built with increased alignment, this limits the number of bits usable for ASLR, but provides some randomization over using fixed load addresses/non-PIE binaries. 
This patch (of 2): The current ELF loading mechanism provides page-aligned mappings. This can lead to the program being loaded in a way unsuitable for file-backed, transparent huge pages when handling PIE executables. For binaries built with increased alignment, this limits the number of bits usable for ASLR, but provides some randomization over using fixed load addresses/non-PIE binaries. Tested by verifying program with -Wl,-z,max-page-size=0x200000 loading. [akpm@linux-foundation.org: fix max() warning] [ckennelly@google.com: augment comment] Link: https://lkml.kernel.org/r/20200821233848.3904680-2-ckennelly@google.com Signed-off-by: Chris Kennelly Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Song Liu Cc: David Rientjes Cc: Ian Rogers Cc: Hugh Dickens Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Fangrui Song Cc: Nick Desaulniers Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Shuah Khan Link: https://lkml.kernel.org/r/20200820170541.1132271-1-ckennelly@google.com Link: https://lkml.kernel.org/r/20200820170541.1132271-2-ckennelly@google.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13d053982dd7..96370e3e3687 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -421,6 +422,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) return 0; } +static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) +{ + unsigned long alignment = 0; + int i; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + unsigned long p_align = cmds[i].p_align; + + /* skip non-power of two alignments as invalid */ + if (!is_power_of_2(p_align)) + continue; + alignment = max(alignment, p_align); + } + } + + /* ensure we align to at least one page */ + return ELF_PAGEALIGN(alignment); +} + /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded @@ -1008,6 +1029,7 @@ out_free_interp: int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; + unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1086,6 +1108,9 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment) + load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED; } else load_bias = 0; -- cgit v1.2.3-59-g8ed1b From 206e22f01941b19f9466f48b53cc0d19de493e7a Mon Sep 17 00:00:00 2001 From: Chris Kennelly Date: Thu, 15 Oct 2020 20:12:36 -0700 Subject: tools/testing/selftests: add self-test for verifying load alignment This produces a PIE binary with a variety of p_align requirements, suitable for verifying that the load address meets that alignment requirement. Signed-off-by: Chris Kennelly Signed-off-by: Andrew Morton Cc: Shuah Khan Cc: Alexander Viro Cc: Alexey Dobriyan Cc: David Rientjes Cc: Fangrui Song Cc: Hugh Dickens Cc: Ian Rogers Cc: "Kirill A.
Shutemov" Cc: Mike Kravetz Cc: Nick Desaulniers Cc: Sandeep Patil Cc: Song Liu Cc: Suren Baghdasaryan Link: https://lkml.kernel.org/r/20200820170541.1132271-3-ckennelly@google.com Link: https://lkml.kernel.org/r/20200821233848.3904680-3-ckennelly@google.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/exec/.gitignore | 1 + tools/testing/selftests/exec/Makefile | 9 +++- tools/testing/selftests/exec/load_address.c | 68 +++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/exec/load_address.c diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index 344a99c6da1b..9e2f00343f15 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -7,6 +7,7 @@ execveat.moved execveat.path.ephemeral execveat.ephemeral execveat.denatured +/load_address_* /recursion-depth xxxxxxxx* pipe diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 0a13b110c1e6..cf69b2fcce59 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE TEST_PROGS := binfmt_script non-regular -TEST_GEN_PROGS := execveat +TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile @@ -27,4 +27,9 @@ $(OUTPUT)/execveat.symlink: $(OUTPUT)/execveat $(OUTPUT)/execveat.denatured: $(OUTPUT)/execveat cp $< $@ chmod -x $@ - +$(OUTPUT)/load_address_4096: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie $< -o $@ +$(OUTPUT)/load_address_2097152: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie $< -o $@ +$(OUTPUT)/load_address_16777216: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie $< -o $@ diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c new file mode 100644 index 000000000000..d487c2f6a615 --- /dev/null +++ b/tools/testing/selftests/exec/load_address.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include + +struct Statistics { + unsigned long long load_address; + unsigned long long alignment; +}; + +int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data) +{ + struct Statistics *stats = (struct Statistics *) data; + int i; + + if (info->dlpi_name != NULL && info->dlpi_name[0] != '\0') { + // Ignore headers from other than the executable. + return 2; + } + + stats->load_address = (unsigned long long) info->dlpi_addr; + stats->alignment = 0; + + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + if (info->dlpi_phdr[i].p_align > stats->alignment) + stats->alignment = info->dlpi_phdr[i].p_align; + } + + return 1; // Terminate dl_iterate_phdr. 
+} + +int main(int argc, char **argv) +{ + struct Statistics extracted; + unsigned long long misalign; + int ret; + + ret = dl_iterate_phdr(ExtractStatistics, &extracted); + if (ret != 1) { + fprintf(stderr, "FAILED\n"); + return 1; + } + + if (extracted.alignment == 0) { + fprintf(stderr, "No alignment found\n"); + return 1; + } else if (extracted.alignment & (extracted.alignment - 1)) { + fprintf(stderr, "Alignment is not a power of 2\n"); + return 1; + } + + misalign = extracted.load_address & (extracted.alignment - 1); + if (misalign) { + printf("alignment = %llu, load_address = %llu\n", + extracted.alignment, extracted.load_address); + fprintf(stderr, "FAILED\n"); + return 1; + } + + fprintf(stderr, "PASS\n"); + return 0; +} -- cgit v1.2.3-59-g8ed1b From 8f942eea12ae8c5a7cde85c145234b25e38de959 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:40 -0700 Subject: binfmt_elf_fdpic: stop using dump_emit() on user pointers on !MMU Patch series "Fix ELF / FDPIC ELF core dumping, and use mmap_lock properly in there", v5. At the moment, we have that rather ugly mmget_still_valid() helper to work around : ELF core dumping doesn't take the mmap_sem while traversing the task's VMAs, and if anything (like userfaultfd) then remotely messes with the VMA tree, fireworks ensue. So at the moment we use mmget_still_valid() to bail out in any writers that might be operating on a remote mm's VMAs. With this series, I'm trying to get rid of the need for that as cleanly as possible. ("cleanly" meaning "avoid holding the mmap_lock across unbounded sleeps".) Patches 1, 2, 3 and 4 are relatively unrelated cleanups in the core dumping code. Patches 5 and 6 implement the main change: Instead of repeatedly accessing the VMA list with sleeps in between, we snapshot it at the start with proper locking, and then later we just use our copy of the VMA list. This ensures that the kernel won't crash, that VMA metadata in the coredump is consistent even in the presence of concurrent modifications, and that any virtual addresses that aren't being concurrently modified have their contents show up in the core dump properly. The disadvantage of this approach is that we need a bit more memory during core dumping for storing metadata about all VMAs. At the end of the series, patch 7 removes the old workaround for this issue (mmget_still_valid()). I have tested: - Creating a simple core dump on X86-64 still works. - The created coredump on X86-64 opens in GDB and looks plausible. - X86-64 core dumps contain the first page for executable mappings at offset 0, and don't contain the first page for non-executable file mappings or executable mappings at offset !=0. - NOMMU 32-bit ARM can still generate plausible-looking core dumps through the FDPIC implementation. (I can't test this with GDB because GDB is missing some structure definition for nommu ARM, but I've poked around in the hexdump and it looked decent.) This patch (of 7): dump_emit() is for kernel pointers, and VMAs describe userspace memory. Let's be tidy here and avoid accessing userspace pointers under KERNEL_DS, even if it probably doesn't matter much on !MMU systems - especially given that it looks like we can just use the same get_dump_page() as on MMU if we move it out of the CONFIG_MMU block. One small change we have to make in get_dump_page() is to use __get_user_pages_locked() instead of __get_user_pages(), since the latter doesn't exist on nommu. On mmu builds, __get_user_pages_locked() will just call __get_user_pages() for us. 
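For orientation, the pattern a caller uses around get_dump_page() is unchanged by this; a sketch of the per-page dump step as it appears in the ELF coredump paths (kernel-context fragment, so not standalone; cprm, addr, and end_coredump come from the enclosing dump loop):

	struct page *page = get_dump_page(addr);	/* NULL means leave a hole */
	int stop;

	if (page) {
		void *kaddr = kmap(page);

		stop = !dump_emit(cprm, kaddr, PAGE_SIZE);	/* write the page */
		kunmap(page);
		put_page(page);
	} else {
		stop = !dump_skip(cprm, PAGE_SIZE);	/* keep the corefile sparse */
	}
	if (stop)
		goto end_coredump;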
Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-1-jannh@google.com Link: http://lkml.kernel.org/r/20200827114932.3572699-2-jannh@google.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 8 -------- mm/gup.c | 57 +++++++++++++++++++++++++-------------------------- 2 files changed, 28 insertions(+), 37 deletions(-) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 50f845702b92..a53f83830986 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1529,14 +1529,11 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) struct vm_area_struct *vma; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { -#ifdef CONFIG_MMU unsigned long addr; -#endif if (!maydump(vma, cprm->mm_flags)) continue; -#ifdef CONFIG_MMU for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { bool res; @@ -1552,11 +1549,6 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) if (!res) return false; } -#else - if (!dump_emit(cprm, (void *) vma->vm_start, - vma->vm_end - vma->vm_start)) - return false; -#endif } return true; } diff --git a/mm/gup.c b/mm/gup.c index ad617e7f22f5..f0f3f4052b94 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1490,35 +1490,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_lock, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1564,6 +1535,34 @@ finish_or_fault: } #endif /* !CONFIG_MMU */ +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_lock, but after all other threads have been killed. 
+ */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages_locked(current->mm, addr, 1, &page, &vma, NULL, + FOLL_FORCE | FOLL_DUMP | FOLL_GET) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { -- cgit v1.2.3-59-g8ed1b From df0c09c0117300cd03f6e8532527bf591d4eee65 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:43 -0700 Subject: coredump: let dump_emit() bail out on short writes dump_emit() has a retry loop, but there seems to be no way for that retry logic to actually be used; and it was also buggy, writing the same data repeatedly after a short write. Let's just bail out on a short write. Suggested-by: Linus Torvalds Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-3-jannh@google.com Signed-off-by: Linus Torvalds --- fs/coredump.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 76e7c10edfc0..5e24c06092c9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) ssize_t n; if (cprm->written + nr > cprm->limit) return 0; - while (nr) { - if (dump_interrupted()) - return 0; - n = __kernel_write(file, addr, nr, &pos); - if (n <= 0) - return 0; - file->f_pos = pos; - cprm->written += n; - cprm->pos += n; - nr -= n; - } + + + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n != nr) + return 0; + file->f_pos = pos; + cprm->written += n; + cprm->pos += n; + return 1; } EXPORT_SYMBOL(dump_emit); -- cgit v1.2.3-59-g8ed1b From afc63a97b764bc5a715762d0d9cc9785c2ef4e75 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:46 -0700 Subject: coredump: refactor page range dumping into common helper Both fs/binfmt_elf.c and fs/binfmt_elf_fdpic.c need to dump ranges of pages into the coredump file. Extract that logic into a common helper. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . 
Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-4-jannh@google.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 22 ++-------------------- fs/binfmt_elf_fdpic.c | 18 +++--------------- fs/coredump.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/coredump.h | 2 ++ 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 96370e3e3687..cf956edf05ef 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2444,26 +2444,8 @@ static int elf_core_dump(struct coredump_params *cprm) for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { - unsigned long addr; - unsigned long end; - - end = vma->vm_start + vma_filesz[i++]; - - for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { - struct page *page; - int stop; - - page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else - stop = !dump_skip(cprm, PAGE_SIZE); - if (stop) - goto end_coredump; - } + if (!dump_user_range(cprm, vma->vm_start, vma_filesz[i++])) + goto end_coredump; } dump_truncate(cprm); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a53f83830986..76e8c0defdc8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1534,21 +1534,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) if (!maydump(vma, cprm->mm_flags)) continue; - for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { - bool res; - struct page *page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - res = dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else { - res = dump_skip(cprm, PAGE_SIZE); - } - if (!res) - return false; - } + if (!dump_user_range(cprm, vma->vm_start, + vma->vma_end - vma->vm_start)) + return false; } return true; } diff --git a/fs/coredump.c b/fs/coredump.c index 5e24c06092c9..6042d15acd51 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr) } EXPORT_SYMBOL(dump_skip); +#ifdef CONFIG_ELF_CORE +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len) +{ + unsigned long addr; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page; + int stop; + + /* + * To avoid having to allocate page tables for virtual address + * ranges that have never been used yet, and also to make it + * easy to generate sparse core files, use a helper that returns + * NULL when encountering an empty page table entry that would + * otherwise have been filled with the zero page. 
+ */ + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + put_page(page); + } else { + stop = !dump_skip(cprm, PAGE_SIZE); + } + if (stop) + return 0; + } + return 1; +} +#endif + int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->pos & (align - 1); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 7a899e83835d..f0b71a74d0bc 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -16,6 +16,8 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else -- cgit v1.2.3-59-g8ed1b From 429a22e776a2b9f85a2b9c53d8e647598b553dd1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:50 -0700 Subject: coredump: rework elf/elf_fdpic vma_dump_size() into common helper At the moment, the binfmt_elf and binfmt_elf_fdpic code have slightly different code to figure out which VMAs should be dumped, and if so, whether the dump should contain the entire VMA or just its first page. Eliminate duplicate code by reworking the binfmt_elf version into a generic core dumping helper in coredump.c. As part of that, change the heuristic for detecting executable/library header pages to check whether the inode is executable instead of looking at the file mode. This is less problematic in terms of locking because it lets us avoid get_user() under the mmap_sem. (And arguably it looks nicer and makes more sense in generic code.) Adjust a little bit based on the binfmt_elf_fdpic version: ->anon_vma is only meaningful under CONFIG_MMU, otherwise we have to assume that the VMA has been written to. Suggested-by: Linus Torvalds Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-5-jannh@google.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 120 ----------------------------------------------- fs/binfmt_elf_fdpic.c | 83 ++------------------------------ fs/coredump.c | 101 +++++++++++++++++++++++++++++++++++++++ include/linux/coredump.h | 1 + 4 files changed, 106 insertions(+), 199 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cf956edf05ef..c1756fd8d10b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1414,126 +1414,6 @@ out: * Jeremy Fitzhardinge */ -/* - * The purpose of always_dump_vma() is to make sure that special kernel mappings - * that are useful for post-mortem analysis are included in every core dump. - * In that way we ensure that the core dump is fully interpretable later - * without matching up the same kernel and hardware config to see what PC values - * meant. These special mappings include - vDSO, vsyscall, and other - * architecture specific mappings - */ -static bool always_dump_vma(struct vm_area_struct *vma) -{ - /* Any vsyscall mappings? */ - if (vma == get_gate_vma(vma->vm_mm)) - return true; - - /* - * Assume that all vmas with a .name op should always be dumped. - * If this changes, a new vm_ops field can easily be added. 
- */ - if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) - return true; - - /* - * arch_vma_name() returns non-NULL for special architecture mappings, - * such as vDSO sections. - */ - if (arch_vma_name(vma)) - return true; - - return false; -} - -/* - * Decide what to dump of a segment, part, all or none. - */ -static unsigned long vma_dump_size(struct vm_area_struct *vma, - unsigned long mm_flags) -{ -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - - /* always dump the vdso and vsyscall sections */ - if (always_dump_vma(vma)) - goto whole; - - if (vma->vm_flags & VM_DONTDUMP) - return 0; - - /* support for DAX */ - if (vma_is_dax(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) - goto whole; - return 0; - } - - /* Hugetlb memory check */ - if (is_vm_hugetlb_page(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) - goto whole; - return 0; - } - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) - return 0; - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0 ? - FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) - goto whole; - return 0; - } - - /* Dump segments that have been written to. */ - if (vma->anon_vma && FILTER(ANON_PRIVATE)) - goto whole; - if (vma->vm_file == NULL) - return 0; - - if (FILTER(MAPPED_PRIVATE)) - goto whole; - - /* - * If this looks like the beginning of a DSO or executable mapping, - * check for an ELF header. If we find one, dump the first page to - * aid in determining what was mapped here. - */ - if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { - u32 __user *header = (u32 __user *) vma->vm_start; - u32 word; - /* - * Doing it this way gets the constant folded by GCC. - */ - union { - u32 cmp; - char elfmag[SELFMAG]; - } magic; - BUILD_BUG_ON(SELFMAG != sizeof word); - magic.elfmag[EI_MAG0] = ELFMAG0; - magic.elfmag[EI_MAG1] = ELFMAG1; - magic.elfmag[EI_MAG2] = ELFMAG2; - magic.elfmag[EI_MAG3] = ELFMAG3; - if (unlikely(get_user(word, header))) - word = 0; - if (word == magic.cmp) - return PAGE_SIZE; - } - -#undef FILTER - - return 0; - -whole: - return vma->vm_end - vma->vm_start; -} - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 76e8c0defdc8..f531c6198864 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic int pr_fpvalid; /* True if math co-processor being used. */ }; -/* - * Decide whether a segment is worth dumping; default is yes to be - * sure (missing info is worse than too much; etc). - * Personally I'd include everything, and use the coredump limit... - * - * I think we should skip something. But I am not sure how. H.J. - */ -static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) -{ - int dump_ok; - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) { - kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* If we may not read the contents, don't allow us to dump - * them either. "dump_write()" can't handle it anyway. 
- */ - if (!(vma->vm_flags & VM_READ)) { - kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* support for DAX */ - if (vma_is_dax(vma)) { - if (vma->vm_flags & VM_SHARED) { - dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } else { - dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } - return dump_ok; - } - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0) { - dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - - dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - -#ifdef CONFIG_MMU - /* By default, if it hasn't been written to, don't write it out */ - if (!vma->anon_vma) { - dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } -#endif - - dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, - dump_ok ? "yes" : "no"); - return dump_ok; -} - /* An ELF note in memory */ struct memelfnote { @@ -1529,13 +1459,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) struct vm_area_struct *vma; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { - unsigned long addr; - - if (!maydump(vma, cprm->mm_flags)) - continue; + unsigned long size = vma_dump_size(vma, cprm->mm_flags); - if (!dump_user_range(cprm, vma->vm_start, - vma->vm_end - vma->vm_start)) + if (!dump_user_range(cprm, vma->vm_start, size)) return false; } return true; @@ -1547,8 +1473,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags) size_t size = 0; for (vma = current->mm->mmap; vma; vma = vma->vm_next) - if (maydump(vma, mm_flags)) - size += vma->vm_end - vma->vm_start; + size += vma_dump_size(vma, mm_flags); return size; } @@ -1694,7 +1619,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); phdr.p_memsz = sz; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; diff --git a/fs/coredump.c b/fs/coredump.c index 6042d15acd51..4ef4c49a65b7 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -936,3 +936,104 @@ void dump_truncate(struct coredump_params *cprm) } } EXPORT_SYMBOL(dump_truncate); + +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + /* + * Assume that all vmas with a .name op should always be dumped.
+ * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* + * Decide how much of @vma's contents should be included in a core dump. + */ +unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) +{ +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) + goto whole; + + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + /* support for DAX */ + if (vma_is_dax(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) + goto whole; + return 0; + } + + /* Hugetlb memory check */ + if (is_vm_hugetlb_page(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + return 0; + } + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) + return 0; + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0 ? + FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) + goto whole; + return 0; + } + + /* Dump segments that have been written to. */ + if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE)) + goto whole; + if (vma->vm_file == NULL) + return 0; + + if (FILTER(MAPPED_PRIVATE)) + goto whole; + + /* + * If this is the beginning of an executable file mapping, + * dump the first page to aid in determining what was mapped here. + */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && + (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} diff --git a/include/linux/coredump.h b/include/linux/coredump.h index f0b71a74d0bc..bfecb8d79a7f 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -16,6 +16,7 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); #ifdef CONFIG_COREDUMP -- cgit v1.2.3-59-g8ed1b From a07279c9a8cd7dbd321640ff7210591599ee00a4 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:54 -0700 Subject: binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot In both binfmt_elf and binfmt_elf_fdpic, use a new helper dump_vma_snapshot() to take a snapshot of the VMA list (including the gate VMA, if we have one) while protected by the mmap_lock, and then use that snapshot instead of walking the VMA list without locking. An alternative approach would be to keep the mmap_lock held across the entire core dumping operation; however, keeping the mmap_lock locked while we may be blocked for an unbounded amount of time (e.g. 
because we're dumping to a FUSE filesystem or so) isn't really optimal; the mmap_lock blocks things like the ->release handler of userfaultfd, and we don't really want critical system daemons to grind to a halt just because someone "gifted" them SCM_RIGHTS to an eternally-locked userfaultfd, or something like that. Since both the normal ELF code and the FDPIC ELF code need this functionality (and if any other binfmt wants to add coredump support in the future, they'd probably need it, too), implement this with a common helper in fs/coredump.c. A downside of this approach is that we now need a bigger amount of kernel memory per userspace VMA in the normal ELF case, and that we need O(n) kernel memory in the FDPIC ELF case at all; but 40 bytes per VMA shouldn't be terribly bad. There currently is a data race between stack expansion and anything that reads ->vm_start or ->vm_end under the mmap_lock held in read mode; to mitigate that for core dumping, take the mmap_lock in write mode when taking a snapshot of the VMA hierarchy. (If we only took the mmap_lock in read mode, we could end up with a corrupted core dump if someone does get_user_pages_remote() concurrently. Not really a major problem, but taking the mmap_lock either way works here, so we might as well avoid the issue.) (This doesn't do anything about the existing data races with stack expansion in other mm code.) Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-6-jannh@google.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 100 +++++++++++------------------------------------ fs/binfmt_elf_fdpic.c | 67 +++++++++++++------------------ fs/coredump.c | 81 +++++++++++++++++++++++++++++++++++++- include/linux/coredump.h | 10 ++++- 4 files changed, 138 insertions(+), 120 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c1756fd8d10b..e7e9d0cde51a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2125,32 +2125,6 @@ static void free_note_info(struct elf_note_info *info) #endif -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} -/* - * Helper function for iterating across a vma list. It ensures that the caller - * will visit `gate_vma' prior to terminating the search. 
- */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) - return NULL; - return gate_vma; -} - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2177,9 +2151,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs, i; - size_t vma_data_size = 0; - struct vm_area_struct *vma, *gate_vma; + int vma_count, segs, i; + size_t vma_data_size; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2187,30 +2160,16 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - elf_addr_t *vma_filesz = NULL; + struct core_vma_metadata *vma_meta; + + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + return 0; - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ - /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); - - gate_vma = get_gate_vma(current->mm); - if (gate_vma != NULL) - segs++; + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2248,24 +2207,6 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * Zero vma process will get ZERO_SIZE_PTR here. - * Let coredump continue for register state at least. - */ - vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), - GFP_KERNEL); - if (!vma_filesz) - goto end_coredump; - - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long dump_size; - - dump_size = vma_dump_size(vma, cprm->mm_flags); - vma_filesz[i++] = dump_size; - vma_data_size += dump_size; - } - offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2286,21 +2227,23 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = vma_filesz[i++]; - phdr.p_memsz = vma->vm_end - vma->vm_start; + phdr.p_filesz = meta->dump_size; + phdr.p_memsz = meta->end - meta->start; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -2322,9 +2265,10 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - if (!dump_user_range(cprm, vma->vm_start, vma_filesz[i++])) + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; + + if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } dump_truncate(cprm); @@ -2340,7 +2284,7 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_filesz); + kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index f531c6198864..be4062b8ba75 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1454,29 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, /* * dump the segments for an MMU process */ -static bool elf_fdpic_dump_segments(struct coredump_params *cprm) +static bool elf_fdpic_dump_segments(struct coredump_params *cprm, + struct core_vma_metadata *vma_meta, + int vma_count) { - struct vm_area_struct *vma; + int i; - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { - unsigned long size = vma_dump_size(vma, cprm->mm_flags); + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - if (!dump_user_range(cprm, vma->vm_start, size)) + if (!dump_user_range(cprm, meta->start, meta->dump_size)) return false; } return true; } -static size_t elf_core_vma_data_size(unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = current->mm->mmap; vma; vma = vma->vm_next) - size += vma_dump_size(vma, mm_flags); - return size; -} - /* * Actual dumper * @@ -1487,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags) static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs; + int vma_count, segs; int i; - struct vm_area_struct *vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; struct memelfnote psinfo_note, auxv_note; @@ -1503,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. 
- */ + struct core_vma_metadata *vma_meta = NULL; + size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1524,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + goto end_coredump; + for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1543,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1589,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(cprm->mm_flags); + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1609,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; size_t sz; - sz = vma->vm_end - vma->vm_start; + sz = meta->end - meta->start; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); + phdr.p_filesz = meta->dump_size; phdr.p_memsz = sz; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -1657,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - if (!elf_fdpic_dump_segments(cprm)) + if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1681,6 +1667,7 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } + kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/coredump.c b/fs/coredump.c index 4ef4c49a65b7..0cd9056d79cc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -971,7 +971,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) /* * Decide how much of @vma's contents should be included in a core dump. */ -unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) +static unsigned long vma_dump_size(struct vm_area_struct *vma, + unsigned long mm_flags) { #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) @@ -1037,3 +1038,81 @@ unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) whole: return vma->vm_end - vma->vm_start; } + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} + +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. 
+ */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +/* + * Under the mmap_lock, take a snapshot of relevant information about the task's + * VMAs. + */ +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr) +{ + struct vm_area_struct *vma, *gate_vma; + struct mm_struct *mm = current->mm; + int i; + size_t vma_data_size = 0; + + /* + * Once the stack expansion code is fixed to not change VMA bounds + * under mmap_lock in read mode, this can be changed to take the + * mmap_lock in read mode. + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; + + gate_vma = get_gate_vma(mm); + *vma_count = mm->map_count + (gate_vma ? 1 : 0); + + *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); + if (!*vma_meta) { + mmap_write_unlock(mm); + return -ENOMEM; + } + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma), i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + m->start = vma->vm_start; + m->end = vma->vm_end; + m->flags = vma->vm_flags; + m->dump_size = vma_dump_size(vma, cprm->mm_flags); + + vma_data_size += m->dump_size; + } + + mmap_write_unlock(mm); + + if (WARN_ON(i != *vma_count)) + return -EFAULT; + + *vma_data_size_ptr = vma_data_size; + return 0; +} diff --git a/include/linux/coredump.h b/include/linux/coredump.h index bfecb8d79a7f..e58e8c207782 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -7,6 +7,12 @@ #include #include +struct core_vma_metadata { + unsigned long start, end; + unsigned long flags; + unsigned long dump_size; +}; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -16,9 +22,11 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); -unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else -- cgit v1.2.3-59-g8ed1b From 7f3bfab52cab96c510fd60dd757a32db7bc52d3c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:57 -0700 Subject: mm/gup: take mmap_lock in get_dump_page() Properly take the mmap_lock before calling into the GUP code from get_dump_page(); and play nice, allowing the GUP code to drop the mmap_lock if it has to sleep. As Linus pointed out, we don't actually need the VMA because __get_user_pages() will flush the dcache for us if necessary. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . 
Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-7-jannh@google.com Signed-off-by: Linus Torvalds --- mm/gup.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f0f3f4052b94..102877ed77a4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1547,19 +1547,23 @@ finish_or_fault: * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save diskspace. * - * Called without mmap_lock, but after all other threads have been killed. + * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { - struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; struct page *page; + int locked = 1; + int ret; - if (__get_user_pages_locked(current->mm, addr, 1, &page, &vma, NULL, - FOLL_FORCE | FOLL_DUMP | FOLL_GET) < 1) + if (mmap_read_lock_killable(mm)) return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; + ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked, + FOLL_FORCE | FOLL_DUMP | FOLL_GET); + if (locked) + mmap_read_unlock(mm); + return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ -- cgit v1.2.3-59-g8ed1b From 4d45e75a9955ade5c2f49bd96fc4173b2cec9a72 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:13:00 -0700 Subject: mm: remove the now-unnecessary mmget_still_valid() hack The preceding patches have ensured that core dumping properly takes the mmap_lock. Thanks to that, we can now remove mmget_still_valid() and all its users. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-8-jannh@google.com Signed-off-by: Linus Torvalds --- drivers/infiniband/core/uverbs_main.c | 3 --- drivers/vfio/pci/vfio_pci.c | 38 +++++++++++++++++------------------ fs/proc/task_mmu.c | 18 ----------------- fs/userfaultfd.c | 28 +++++++++----------------- include/linux/sched/mm.h | 25 ----------------------- mm/khugepaged.c | 2 +- mm/madvise.c | 17 ---------------- mm/mmap.c | 5 +---- 8 files changed, 29 insertions(+), 107 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 37794d88b1f3..a4ba0b87d6de 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -845,8 +845,6 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) * will only be one mm, so no big deal. 
*/ mmap_read_lock(mm); - if (!mmget_still_valid(mm)) - goto skip_mm; mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -865,7 +863,6 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) } } mutex_unlock(&ufile->umap_lock); - skip_mm: mmap_read_unlock(mm); mmput(mm); } diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1ab1f5cda4ac..b0f4b92a87ed 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1480,31 +1480,29 @@ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) } else { mmap_read_lock(mm); } - if (mmget_still_valid(mm)) { - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); + if (try) { + if (!mutex_trylock(&vdev->vma_lock)) { + mmap_read_unlock(mm); + mmput(mm); + return 0; } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; + } else { + mutex_lock(&vdev->vma_lock); + } + list_for_each_entry_safe(mmap_vma, tmp, + &vdev->vma_list, vma_next) { + struct vm_area_struct *vma = mmap_vma->vma; - if (vma->vm_mm != mm) - continue; + if (vma->vm_mm != mm) + continue; - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); } + mutex_unlock(&vdev->vma_lock); mmap_read_unlock(mm); mmput(mm); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 846d43df3fdf..217aa2705d5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1244,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } - /* - * Avoid to modify vma->vm_flags - * without locked ops while the - * coredump reads the vm_flags. - */ - if (!mmget_still_valid(mm)) { - /* - * Silently return "count" - * like if get_task_mm() - * failed. FIXME: should this - * function have returned - * -ESRCH if get_task_mm() - * failed like if - * get_proc_task() fails? - */ - mmap_write_unlock(mm); - goto out_mm; - } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0e4a3837da52..000b457ad087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - /* no task can run (and in turn coredump) yet */ - VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - bool still_valid; WRITE_ONCE(ctx->released, true); @@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_lock for writing. 
*/ mmap_write_lock(mm); - still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - if (still_valid) { - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; - } + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 15bfb06f2884..981e34cb1409 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,31 +49,6 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } -/* - * This has to be called after a get_task_mm()/mmget_not_zero() - * followed by taking the mmap_lock for writing before modifying the - * vmas or anything the coredump pretends not to change from under it. - * - * It also has to be called when mmgrab() is used in the context of - * the process, but then the mm_count refcount is transferred outside - * the context of the process to run down_write() on that pinned mm. - * - * NOTE: find_extend_vma() called from GUP context is the only place - * that can modify the "mm" (notably the vm_start/end) under mmap_lock - * for reading and outside the context of the process, so it is also - * the only case that holds the mmap_lock for reading that must call - * this function. Generally if the mmap_lock is hold for reading - * there's no need of this check after get_task_mm()/mmget_not_zero(). - * - * This function can be obsoleted and the check can be removed, after - * the coredump code will hold the mmap_lock for writing before - * invoking the ->core_dump methods. - */ -static inline bool mmget_still_valid(struct mm_struct *mm) -{ - return likely(!mm->core_state); -} - /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 58b0d9c502a1..4e3dff13eb70 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -434,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, static inline int khugepaged_test_exit(struct mm_struct *mm) { - return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm); + return atomic_read(&mm->mm_users) == 0; } static bool hugepage_vma_check(struct vm_area_struct *vma, diff --git a/mm/madvise.c b/mm/madvise.c index 68b761c359d0..fd1f448b4e1d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1085,23 +1085,6 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) if (write) { if (mmap_write_lock_killable(current->mm)) return -EINTR; - - /* - * We may have stolen the mm from another process - * that is undergoing core dumping. - * - * Right now that's io_ring, in the future it may - * be remote process management and not "current" - * at all. - * - * We need to fix core dumping to not do this, - * but for now we have the mmget_still_valid() - * model. - */ - if (!mmget_still_valid(current->mm)) { - mmap_write_unlock(current->mm); - return -EINTR; - } } else { mmap_read_lock(current->mm); } diff --git a/mm/mmap.c b/mm/mmap.c index 9fb9f8a233c7..ebb92f5515a1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2562,7 +2562,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (vma && (vma->vm_start <= addr)) return vma; /* don't alter vm_end if the coredump is running */ - if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) + if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2588,9 +2588,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; - /* don't alter vm_start if the coredump is running */ - if (!mmget_still_valid(mm)) - return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; -- cgit v1.2.3-59-g8ed1b From 50b7d85680086126d7bd91dae81d57d4cb1ab6b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:13:04 -0700 Subject: ramfs: fix nommu mmap with gaps in the page cache ramfs needs to check that pages are both physically contiguous and contiguous in the file. If the page cache happens to have, eg, page A for index 0 of the file, no page for index 1, and page A+1 for index 2, then an mmap of the first two pages of the file will succeed when it should fail. 
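A sketch of the two contiguity requirements involved, following the shape of ramfs_nommu_get_unmapped_area() (kernel-context fragment: declarations and cleanup are elided, and the physical-adjacency test is paraphrased via pfns rather than copied from the source):

	/* pages must be consecutive in the file: a hole in the page cache
	 * (e.g. no page at index 1) stops the gang lookup short */
	nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages);
	if (nr != lpages)
		goto out_free_pages;

	/* ...and physically adjacent, or a flat nommu mapping is impossible */
	pfn = page_to_pfn(pages[0]);
	for (loop = 1; loop < lpages; loop++)
		if (page_to_pfn(pages[loop]) != pfn + loop)
			goto out_free_pages;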
Fixes: 642fb4d1f1dd ("[PATCH] NOMMU: Provide shared-writable mmap support on ramfs") Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Link: https://lkml.kernel.org/r/20200914122239.GO6583@casper.infradead.org Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 414695454956..355523f4a4bf 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); + nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ -- cgit v1.2.3-59-g8ed1b From 589f6b52682542c1b230c435e1de679755f3332a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 15 Oct 2020 20:13:08 -0700 Subject: autofs: harden ioctl table The table of ioctl functions should be marked const in order to put them in read-only memory, and we should use array_index_nospec() to avoid speculation disclosing the contents of kernel memory to userspace. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Acked-by: Ian Kent Link: https://lkml.kernel.org/r/20200818122203.GO17456@casper.infradead.org Signed-off-by: Linus Torvalds --- fs/autofs/dev-ioctl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 75105f45c51a..322b7dfb4ea0 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "autofs_i.h" @@ -563,7 +564,7 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static ioctl_fn _ioctls[] = { + static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, @@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? 
NULL : _ioctls[idx]; + if (idx >= ARRAY_SIZE(_ioctls)) + return NULL; + idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); + return _ioctls[idx]; } /* ioctl dispatcher */ -- cgit v1.2.3-59-g8ed1b From 64ead5201e757159d1d83a5e0752090c2ab3319f Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 15 Oct 2020 20:13:11 -0700 Subject: nilfs2: fix some kernel-doc warnings for nilfs2 Fixes the following W=1 kernel build warning(s): fs/nilfs2/bmap.c:378: warning: Excess function parameter 'bhp' description in 'nilfs_bmap_assign' fs/nilfs2/cpfile.c:907: warning: Excess function parameter 'status' description in 'nilfs_cpfile_change_cpmode' fs/nilfs2/cpfile.c:946: warning: Excess function parameter 'stat' description in 'nilfs_cpfile_get_stat' fs/nilfs2/page.c:76: warning: Excess function parameter 'inode' description in 'nilfs_forget_buffer' fs/nilfs2/sufile.c:563: warning: Excess function parameter 'stat' description in 'nilfs_sufile_get_stat' Signed-off-by: Wang Hai Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/1601386269-2423-1-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Linus Torvalds --- fs/nilfs2/bmap.c | 2 +- fs/nilfs2/cpfile.c | 6 +++--- fs/nilfs2/page.c | 1 - fs/nilfs2/sufile.c | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index e516ae389ca5..5900879d5693 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block * @bmap: bmap - * @bhp: pointer to buffer head + * @bh: pointer to buffer head * @blocknr: block number * @binfo: block information * diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 86d4d850d130..025fb082575a 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number - * @status: mode of checkpoint + * @mode: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. @@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file - * @stat: pointer to a structure of checkpoint statistics + * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @cpstat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. 
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b175f1330408..171fb5cd427f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, /** * nilfs_forget_buffer - discard dirty state - * @inode: owner inode of the buffer * @bh: buffer head of the buffer to be discarded */ void nilfs_forget_buffer(struct buffer_head *bh) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 42ff67c0c14f..63722475e17e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file - * @stat: pointer to a structure of segment usage statistics + * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() returns information about segment * usage. * * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @sustat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. -- cgit v1.2.3-59-g8ed1b From fa63f083b3492b5ed5332b8d7c90b03b5ef24a1d Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 15 Oct 2020 20:13:15 -0700 Subject: rapidio: fix error handling path rio_dma_transfer() attempts to clamp the return value of pin_user_pages_fast() to be >= 0. However, the attempt fails because nr_pages is overridden a few lines later, and restored to the undesirable -ERRNO value. The return value is ultimately stored in nr_pages, which in turn is passed to unpin_user_pages(), which expects nr_pages >= 0, else, disaster. Fix this by fixing the nesting of the assignment to nr_pages: nr_pages should be clamped to zero if pin_user_pages_fast() returns -ERRNO, or set to the return value of pin_user_pages_fast(), otherwise. [jhubbard@nvidia.com: new changelog] Fixes: e8de370188d09 ("rapidio: add mport char device driver") Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: Ira Weiny Reviewed-by: John Hubbard Cc: Matthew Wilcox Cc: Matt Porter Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Madhuparna Bhowmik Cc: Dan Carpenter Link: https://lkml.kernel.org/r/1600227737-20785-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index a30342942e26..163b6c72501d 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -871,15 +871,16 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, rmcd_error("pin_user_pages_fast err=%ld", pinned); nr_pages = 0; - } else + } else { rmcd_error("pinned %ld out of %ld pages", pinned, nr_pages); + /* + * Set nr_pages up to mean "how many pages to unpin" in + * the error handler. + */ + nr_pages = pinned; + } ret = -EFAULT; - /* - * Set nr_pages up to mean "how many pages to unpin, in - * the error handler: - */ - nr_pages = pinned; goto err_pg; } -- cgit v1.2.3-59-g8ed1b From 85094c05eeb47d195a74a25366a2db066f1c9d47 Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Thu, 15 Oct 2020 20:13:18 -0700 Subject: rapidio: fix the missed put_device() for rio_mport_add_riodev rio_mport_add_riodev() fails to call put_device() when the device already exists. Add the missing call to fix it. Fixes: e8de370188d0 ("rapidio: add mport char device driver") Signed-off-by: Jing Xiangfeng Signed-off-by: Andrew Morton Reviewed-by: Dan Carpenter Cc: Matt Porter Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: John Hubbard Cc: Kees Cook Cc: Madhuparna Bhowmik Link: https://lkml.kernel.org/r/20200922072525.42330-1-jingxiangfeng@huawei.com Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 163b6c72501d..94331d999d27 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -1680,6 +1680,7 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv, struct rio_dev *rdev; struct rio_switch *rswitch = NULL; struct rio_mport *mport; + struct device *dev; size_t size; u32 rval; u32 swpinfo = 0; @@ -1694,8 +1695,10 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv, rmcd_debug(RDEV, "name:%s ct:0x%x did:0x%x hc:0x%x", dev_info.name, dev_info.comptag, dev_info.destid, dev_info.hopcount); - if (bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name)) { + dev = bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name); + if (dev) { rmcd_debug(RDEV, "device %s already exists", dev_info.name); + put_device(dev); return -EEXIST; } -- cgit v1.2.3-59-g8ed1b From 3f388f28639fd19d5bf6df7a882c94ccfbf49c2b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Oct 2020 20:13:22 -0700 Subject: panic: dump registers on panic_on_warn Currently we print the stack and registers for ordinary warnings but we do not for panic_on_warn, which looks like an oversight - panic() will reboot the machine but won't print registers. This moves printing of registers and modules earlier. This does not move the stack dumping as panic() dumps it.
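Pieced together from the two hunks below, the resulting order of operations in __warn() is roughly the following; this is a condensed paraphrase (argument printing, taint handling and the full comment are omitted), not the verbatim function:

	print_modules();		/* now emitted before a possible panic */
	if (regs)
		show_regs(regs);	/* registers reach the log even with panic_on_warn */

	if (panic_on_warn) {
		/* this thread may hit another WARN() in the panic path */
		panic_on_warn = 0;
		panic("panic_on_warn set ...\n");
	}

	dump_stack();			/* left down here: panic() dumps the stack itself */
	print_irqtrace_events(current);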
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Douglas Anderson Cc: Ingo Molnar Cc: Kees Cook Cc: Rafael Aquini Cc: Thomas Gleixner Cc: Will Deacon Cc: Nicholas Piggin Link: https://lkml.kernel.org/r/20200804095054.68724-1-aik@ozlabs.ru Signed-off-by: Linus Torvalds --- kernel/panic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index aef8872ba843..396142ee43fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); + print_modules(); + + if (regs) + show_regs(regs); + if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - print_modules(); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); print_irqtrace_events(current); -- cgit v1.2.3-59-g8ed1b From ac05b7a1b48ba9fc79937a08db4c7131dba8fc5f Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 15 Oct 2020 20:13:25 -0700 Subject: kernel/relay.c: drop unneeded initialization The variable 'consumed' is initialized with the consumed count, but immediately afterwards the consumed count is updated and assigned to 'consumed' again, overwriting the previous value. So, drop the unneeded initialization. Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: https://lkml.kernel.org/r/20201005205727.1147-1-sudipm.mukherjee@gmail.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/relay.c b/kernel/relay.c index fb4e0c530c08..b08d936d5fa7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1002,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; + size_t consumed; relay_file_read_consume(buf, 0, 0); -- cgit v1.2.3-59-g8ed1b From 998ec76b920086d9f6bac8b11719ee81976743b6 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 15 Oct 2020 20:13:29 -0700 Subject: scripts/gdb/proc: add struct mount & struct super_block addr in lx-mounts command This is often useful when debugging FS-related issues.
mount super_block devname pathname fstype options 0xffff888a0bfa4b40 0xffff888a0bfc1000 none / rootfs rw 0 0 0xffff888a033f75c0 0xffff8889fcf65000 /dev/root / ext4 rw,relatime 0 0 0xffff8889fc8ce040 0xffff888a0bb51000 devtmpfs /dev devtmpfs rw,relatime 0 0 Signed-off-by: Ritesh Harjani Signed-off-by: Andrew Morton Reviewed-by: Jan Kiszka Cc: Kieran Bingham Link: http://lkml.kernel.org/r/a3c4177e1597b3e06d66d55e07d72c0c46a03571.1597742951.git.riteshh@linux.ibm.com Signed-off-by: Linus Torvalds --- scripts/gdb/linux/proc.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 6a56bba233a9..09cd871925a5 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -167,6 +167,9 @@ values of that process namespace""" if not namespace: raise gdb.GdbError("No namespace for current process") + gdb.write("{:^18} {:^15} {:>9} {} {} options\n".format( + "mount", "super_block", "devname", "pathname", "fstype")) + for vfs in lists.list_for_each_entry(namespace['list'], mount_ptr_type, "mnt_list"): devname = vfs['mnt_devname'].string() @@ -190,14 +193,10 @@ values of that process namespace""" m_flags = int(vfs['mnt']['mnt_flags']) rd = "ro" if (s_flags & constants.LX_SB_RDONLY) else "rw" - gdb.write( - "{} {} {} {}{}{} 0 0\n" - .format(devname, - pathname, - fstype, - rd, - info_opts(FS_INFO, s_flags), - info_opts(MNT_INFO, m_flags))) + gdb.write("{} {} {} {} {} {}{}{} 0 0\n".format( + vfs.format_string(), superblock.format_string(), devname, + pathname, fstype, rd, info_opts(FS_INFO, s_flags), + info_opts(MNT_INFO, m_flags))) LxMounts() -- cgit v1.2.3-59-g8ed1b From 4fbe310e44067e5f15c327483031f9564e6b1826 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 15 Oct 2020 20:13:32 -0700 Subject: scripts/gdb/tasks: add headers and improve spacing format With the patch: TASK PID COMM 0xffffffff82c2b8c0 0 swapper/0 0xffff888a0ba20040 1 systemd 0xffff888a0ba24040 2 kthreadd 0xffff888a0ba28040 3 rcu_gp Without the patch: 0xffffffff82c2b8c0 0 swapper/0 0xffff888a0ba20040 1 systemd 0xffff888a0ba24040 2 kthreadd 0xffff888a0ba28040 3 rcu_gp Signed-off-by: Ritesh Harjani Signed-off-by: Andrew Morton Reviewed-by: Jan Kiszka Cc: Kieran Bingham Link: http://lkml.kernel.org/r/54c868c79b5fc364a8be7799891934a6fe6d1464.1597742951.git.riteshh@linux.ibm.com Signed-off-by: Linus Torvalds --- scripts/gdb/linux/tasks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index 0301dc1e0138..17ec19e9b5bf 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -73,11 +73,12 @@ class LxPs(gdb.Command): super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA) def invoke(self, arg, from_tty): + gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM")) for task in task_lists(): - gdb.write("{address} {pid} {comm}\n".format( - address=task, - pid=task["pid"], - comm=task["comm"].string())) + gdb.write("{} {:^5} {}\n".format( + task.format_string().split()[0], + task["pid"].format_string(), + task["comm"].string())) LxPs() -- cgit v1.2.3-59-g8ed1b From 5cf53f3ce3b9ff5321b56f9ed9d90d59307be7d0 Mon Sep 17 00:00:00 2001 From: Elena Petrova Date: Thu, 15 Oct 2020 20:13:35 -0700 Subject: sched.h: drop in_ubsan field when UBSAN is in trap mode The in_ubsan field of task_struct is only used in lib/ubsan.c, which in turn is compiled only `ifneq ($(CONFIG_UBSAN_TRAP),y)`.
Removing this unnecessary field from task_struct will help preserve the ABI between vanilla and CONFIG_UBSAN_TRAP'ed kernels. In particular, this will help enable the bounds sanitizer transparently for Android's GKI. Signed-off-by: Elena Petrova Signed-off-by: Andrew Morton Acked-by: Kees Cook Cc: Jann Horn Link: https://lkml.kernel.org/r/20200910134802.3160311-1-lenaptr@google.com Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9030f3abd969..063cd120b459 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_UBSAN +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif -- cgit v1.2.3-59-g8ed1b From 6a6155f664e31c9be43cd690541a9a682ba3dc22 Mon Sep 17 00:00:00 2001 From: George Popescu Date: Thu, 15 Oct 2020 20:13:38 -0700 Subject: ubsan: introduce CONFIG_UBSAN_LOCAL_BOUNDS for Clang When the kernel is compiled with Clang, -fsanitize=bounds expands to -fsanitize=array-bounds and -fsanitize=local-bounds. Enabling -fsanitize=local-bounds with Clang has the unfortunate side-effect of inserting traps; this goes back to its original intent, which was a hardening feature rather than a debugging one [1]. The same feature made its way into -fsanitize=bounds, but the traps remained. For that reason, -fsanitize=bounds was split into 'array-bounds' and 'local-bounds' [2]. Since 'local-bounds' doesn't behave like a normal sanitizer, enable it with Clang only if trapping behaviour was requested by CONFIG_UBSAN_TRAP=y. Add the UBSAN_LOCAL_BOUNDS config to Kconfig.ubsan to enable the 'local-bounds' option by default when UBSAN_TRAP is enabled. [1] http://lists.llvm.org/pipermail/llvm-dev/2012-May/049972.html [2] http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20131021/091536.html Suggested-by: Marco Elver Signed-off-by: George Popescu Signed-off-by: Andrew Morton Reviewed-by: David Brazdil Reviewed-by: Marco Elver Cc: Masahiro Yamada Cc: Michal Marek Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Kees Cook Cc: Dmitry Vyukov Cc: Arnd Bergmann Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20200922074330.2549523-1-georgepope@google.com Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 14 ++++++++++++++ scripts/Makefile.ubsan | 10 +++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 774315de555a..58f8d03d037b 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -47,6 +47,20 @@ config UBSAN_BOUNDS to the {str,mem}*cpy() family of functions (that is addressed by CONFIG_FORTIFY_SOURCE). +config UBSAN_LOCAL_BOUNDS + bool "Perform array local bounds checking" + depends on UBSAN_TRAP + depends on CC_IS_CLANG + depends on !UBSAN_KCOV_BROKEN + help + This option enables -fsanitize=local-bounds which traps when an + exception/error is detected. Therefore, it should be enabled only + if trapping is expected. + Enabling this option detects errors due to accesses through a + pointer that is derived from an object of a statically-known size, + where an added offset (which may not be known statically) is + out-of-bounds.
+ config UBSAN_MISC bool "Enable all other Undefined Behavior sanity checks" default UBSAN diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 27348029b2b8..4e3fff0745e8 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -4,7 +4,15 @@ ifdef CONFIG_UBSAN_ALIGNMENT endif ifdef CONFIG_UBSAN_BOUNDS - CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) + ifdef CONFIG_CC_IS_CLANG + CFLAGS_UBSAN += -fsanitize=array-bounds + else + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) + endif +endif + +ifdef CONFIG_UBSAN_LOCAL_BOUNDS + CFLAGS_UBSAN += -fsanitize=local-bounds endif ifdef CONFIG_UBSAN_MISC -- cgit v1.2.3-59-g8ed1b From d9bc85de46cad90851c346b592005090674b7669 Mon Sep 17 00:00:00 2001 From: Libing Zhou Date: Thu, 15 Oct 2020 20:13:42 -0700 Subject: ROMFS: support inode blocks calculation When using the 'stat' tool to display file status, the 'Blocks' field is always '0'. This breaks tools such as 'du' (e.g. busybox 'du'), which calculate the number of 512B blocks and therefore report a size of '0' for every file under ROMFS. This patch calculates the approximate number of 512B blocks from the inode size. Signed-off-by: Libing Zhou Signed-off-by: Andrew Morton Cc: David Howells Cc: Al Viro Link: http://lkml.kernel.org/r/20200811052606.4243-1-libing.zhou@nokia-sbell.com Signed-off-by: Linus Torvalds --- fs/romfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e582d001f792..b1b7d3f5752f 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) } i->i_mode = mode; + i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; -- cgit v1.2.3-59-g8ed1b From 2c739ced5886cd8c8361faa79a9522ec05174ed0 Mon Sep 17 00:00:00 2001 From: Albert van der Linde Date: Thu, 15 Oct 2020 20:13:46 -0700 Subject: lib, include/linux: add usercopy failure capability Patch series "add fault injection to user memory access", v3. The goal of this series is to improve testing of fault-tolerance in usages of user memory access functions, by adding support for fault injection. syzkaller/syzbot are using the existing fault injection modes and will use this particular feature also. The first patch adds failure injection capability for usercopy functions. The second changes usercopy functions to use this new failure capability (copy_from_user, ...). The third patch adds get/put/clear_user failures to x86. This patch (of 3): Add a failure injection capability to improve testing of fault-tolerance in usages of user memory access functions. Add CONFIG_FAULT_INJECTION_USERCOPY to enable faults in usercopy functions. The should_fail_usercopy() function is called by these functions (copy_from_user, get_user, ...) to decide whether the access should fail. Signed-off-by: Albert van der Linde Signed-off-by: Andrew Morton Reviewed-by: Akinobu Mita Reviewed-by: Alexander Potapenko Cc: Borislav Petkov Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Peter Zijlstra (Intel) Cc: "H.
Peter Anvin" Cc: Al Viro Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Marco Elver Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200831171733.955393-1-alinde@google.com Link: http://lkml.kernel.org/r/20200831171733.955393-2-alinde@google.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/fault-injection/fault-injection.rst | 7 +++- include/linux/fault-inject-usercopy.h | 22 +++++++++++++ lib/Kconfig.debug | 7 ++++ lib/Makefile | 1 + lib/fault-inject-usercopy.c | 39 +++++++++++++++++++++++ 6 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 include/linux/fault-inject-usercopy.h create mode 100644 lib/fault-inject-usercopy.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f7ac0e663976..6d9afb221ff7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1343,6 +1343,7 @@ current integrity status. failslab= + fail_usercopy= fail_page_alloc= fail_make_request=[KNL] General fault injection mechanism. diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index f850ad018b70..31ecfe44e5b4 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -16,6 +16,10 @@ Available fault injection capabilities injects page allocation failures. (alloc_pages(), get_free_pages(), ...) +- fail_usercopy + + injects failures in user memory access functions. (copy_from_user(), get_user(), ...) + - fail_futex injects futex deadlock and uaddr fault errors. @@ -177,6 +181,7 @@ use the boot option:: failslab= fail_page_alloc= + fail_usercopy= fail_make_request= fail_futex= mmc_core.fail_request=,,, @@ -222,7 +227,7 @@ How to add new fault injection capability - debugfs entries - failslab, fail_page_alloc, and fail_make_request use this way. + failslab, fail_page_alloc, fail_usercopy, and fail_make_request use this way. Helper functions: fault_create_debugfs_attr(name, parent, attr); diff --git a/include/linux/fault-inject-usercopy.h b/include/linux/fault-inject-usercopy.h new file mode 100644 index 000000000000..56c3a693fdd9 --- /dev/null +++ b/include/linux/fault-inject-usercopy.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_FAULT_INJECT_USERCOPY_H__ +#define __LINUX_FAULT_INJECT_USERCOPY_H__ + +/* + * This header provides a wrapper for injecting failures to user space memory + * access functions. + */ + +#include + +#ifdef CONFIG_FAULT_INJECTION_USERCOPY + +bool should_fail_usercopy(void); + +#else + +static inline bool should_fail_usercopy(void) { return false; } + +#endif /* CONFIG_FAULT_INJECTION_USERCOPY */ + +#endif /* __LINUX_FAULT_INJECT_USERCOPY_H__ */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 491789a793ae..ebe5ab111e65 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1768,6 +1768,13 @@ config FAIL_PAGE_ALLOC help Provide fault-injection capability for alloc_pages(). +config FAULT_INJECTION_USERCOPY + bool "Fault injection capability for usercopy functions" + depends on FAULT_INJECTION + help + Provides fault-injection capability to inject failures + in usercopy functions (copy_from_user(), get_user(), ...). 
+ config FAIL_MAKE_REQUEST bool "Fault-injection capability for disk IO" depends on FAULT_INJECTION && BLOCK diff --git a/lib/Makefile b/lib/Makefile index 49a2a9e36224..1c7577b2e86a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -210,6 +210,7 @@ obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o +obj-$(CONFIG_FAULT_INJECTION_USERCOPY) += fault-inject-usercopy.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c new file mode 100644 index 000000000000..77558b6c29ca --- /dev/null +++ b/lib/fault-inject-usercopy.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +static struct { + struct fault_attr attr; +} fail_usercopy = { + .attr = FAULT_ATTR_INITIALIZER, +}; + +static int __init setup_fail_usercopy(char *str) +{ + return setup_fault_attr(&fail_usercopy.attr, str); +} +__setup("fail_usercopy=", setup_fail_usercopy); + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_usercopy_debugfs(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_usercopy", NULL, + &fail_usercopy.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + return 0; +} + +late_initcall(fail_usercopy_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +bool should_fail_usercopy(void) +{ + return should_fail(&fail_usercopy.attr, 1); +} +EXPORT_SYMBOL_GPL(should_fail_usercopy); -- cgit v1.2.3-59-g8ed1b From 4d0e9df5e43dba52d38b251e3b909df8fa1110be Mon Sep 17 00:00:00 2001 From: Albert van der Linde Date: Thu, 15 Oct 2020 20:13:50 -0700 Subject: lib, uaccess: add failure injection to usercopy functions To test the fault-tolerance of user memory access functions, introduce fault injection to the usercopy functions. When a failure is injected, either -EFAULT or the total number of bytes that were not copied is returned. Signed-off-by: Albert van der Linde Signed-off-by: Andrew Morton Reviewed-by: Akinobu Mita Reviewed-by: Alexander Potapenko Cc: Al Viro Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Marco Elver Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200831171733.955393-3-alinde@google.com Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 11 ++++++++++- lib/iov_iter.c | 5 +++++ lib/strncpy_from_user.c | 3 +++ lib/usercopy.c | 5 ++++- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ef084eacaa7c..1b8c9d6162bc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include #include #include #include @@ -84,6 +85,8 @@ static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); @@ -105,6 +108,8 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -114,6 +119,8 @@ static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -125,7 +132,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (likely(access_ok(from, n))) { + if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } @@ -143,6 +150,8 @@ static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 14cae2584db4..1635111c5bd2 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,8 @@ static int copyout(void __user *to, const void *from, size_t n) { + if (should_fail_usercopy()) + return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); @@ -149,6 +152,8 @@ static int copyout(void __user *to, const void *from, size_t n) static int copyin(void *to, const void __user *from, size_t n) { + if (should_fail_usercopy()) + return n; if (access_ok(from, n)) { instrument_copy_from_user(to, from, n); n = raw_copy_from_user(to, from, n); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 34696a348864..e6d5fcc2cdf3 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -99,6 +100,8 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max_addr, src_addr; might_fault(); + if (should_fail_usercopy()) + return -EFAULT; if (unlikely(count <= 0)) return 0; diff --git a/lib/usercopy.c b/lib/usercopy.c index 
b26509f112f9..7413dd300516 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -10,7 +11,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n { unsigned long res = n; might_fault(); - if (likely(access_ok(from, n))) { + if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } @@ -25,6 +26,8 @@ EXPORT_SYMBOL(_copy_from_user); unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; if (likely(access_ok(to, n))) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); -- cgit v1.2.3-59-g8ed1b
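Taken together, these two patches make an injected failure look, from the caller's point of view, exactly like a genuine fault. As a usage sketch, the ioctl-style helper below is hypothetical (it is not part of the patches above) and only illustrates the kind of error path fail_usercopy is meant to exercise: once the fault is armed, via the fail_usercopy= boot parameter or the debugfs attributes created by fault_create_debugfs_attr() above, copy_from_user() reports uncopied bytes even for a perfectly valid user pointer, so this branch finally gets tested:

	#include <linux/types.h>
	#include <linux/uaccess.h>

	struct demo_args {			/* hypothetical ioctl payload */
		u32 flags;
		u64 addr;
	};

	static long demo_set_args(void __user *uptr, struct demo_args *out)
	{
		/* copy_from_user() returns the number of bytes NOT copied. */
		if (copy_from_user(out, uptr, sizeof(*out)))
			return -EFAULT;		/* the branch fault injection drives tests into */

		return 0;
	}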