From d622abf74f3d81365e41c3bfdbbda50ecd99ba3d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:56:53 -0700 Subject: mm: memblock: replace dereferences of memblock_region.nid with API calls
Patch series "mm: rework free_area_init*() functions".
After the discussion [1] about removal of CONFIG_NODES_SPAN_OTHER_NODES and CONFIG_HAVE_MEMBLOCK_NODE_MAP options, I took it a bit further and updated the node/zone initialization. Since all architectures have memblock, it is possible to use only the newer version of free_area_init_node() that calculates the zone and node boundaries based on memblock node mapping and architectural limits on possible zone PFNs. The architectures that still determine zone and hole sizes themselves can be switched to the generic code, and the old code that took those zone and hole sizes can simply be removed.
And, since it all started from the removal of CONFIG_NODES_SPAN_OTHER_NODES, memmap_init() is now updated to iterate over memblocks so that it does not need to perform an early_pfn_to_nid() query for every PFN.
[1] https://lore.kernel.org/lkml/1585420282-25630-1-git-send-email-Hoan@os.amperecomputing.com
This patch (of 21):
There are several places in the code that directly dereference memblock_region.nid despite this field being defined only when CONFIG_HAVE_MEMBLOCK_NODE_MAP=y. Replace these with calls to memblock_get_region_node() to improve code robustness and to avoid possible breakage when CONFIG_HAVE_MEMBLOCK_NODE_MAP is removed.
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200412194859.12663-2-rppt@kernel.org Signed-off-by: Linus Torvalds
--- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c')
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca864102bebe..511602288e2d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7220,7 +7220,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (!memblock_is_hotpluggable(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? @@ -7241,7 +7241,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (memblock_is_mirror(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = memblock_region_memory_base_pfn(r); -- cgit v1.2.3-59-g8ed1b
From 6f24fbd38c4e05f7905814791806c01dc6c4b9de Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:56:57 -0700 Subject: mm: make early_pfn_to_nid() and related definitions close to each other
early_pfn_to_nid() and its helper __early_pfn_to_nid() are spread around include/linux/mm.h, include/linux/mmzone.h and mm/page_alloc.c. Drop the unused stub for __early_pfn_to_nid() and move its actual generic implementation close to its users.
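For context on the helper being consolidated here: __early_pfn_to_nid() caches the last resolved memory range, so consecutive PFN queries usually skip the memblock search. Below is a minimal, standalone C sketch of that caching pattern; struct pfn_range, the ranges[] table and the linear lookup are hypothetical stand-ins for the kernel's mminit_pfnnid_cache and memblock_search_pfn_nid(), not the actual implementation.

#include <stddef.h>
#include <stdio.h>

struct pfn_range { unsigned long start, end; int nid; };
struct nid_cache { unsigned long last_start, last_end; int last_nid; };

/* Hypothetical stand-in for memblock's per-node memory ranges. */
static const struct pfn_range ranges[] = {
	{ 0x0000, 0x8000, 0 },
	{ 0x8000, 0x10000, 1 },
};

static int lookup_nid(unsigned long pfn, struct nid_cache *cache)
{
	size_t i;

	/* Fast path: consecutive PFNs usually hit the cached range. */
	if (cache->last_start <= pfn && pfn < cache->last_end)
		return cache->last_nid;

	/* Slow path: linear scan here; the kernel binary-searches memblock. */
	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		if (ranges[i].start <= pfn && pfn < ranges[i].end) {
			cache->last_start = ranges[i].start;
			cache->last_end = ranges[i].end;
			cache->last_nid = ranges[i].nid;
			return ranges[i].nid;
		}
	}
	return -1; /* no matching range; the kernel returns NUMA_NO_NODE */
}

int main(void)
{
	struct nid_cache cache = { 0, 0, -1 };
	unsigned long pfn;

	/* Crossing 0x8000 misses the cache once, then hits again. */
	for (pfn = 0x7ffe; pfn < 0x8002; pfn++)
		printf("pfn %#lx -> nid %d\n", pfn, lookup_nid(pfn, &cache));
	return 0;
}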
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- include/linux/mmzone.h | 9 --------- mm/page_alloc.c | 51 +++++++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 37 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c236c5b0015..4288e6993dc8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2445,9 +2445,9 @@ extern void sparse_memory_present_with_active_regions(int nid); #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) +static inline int early_pfn_to_nid(unsigned long pfn) { + BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); return 0; } #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fdd9beb5efed..c3a77eb85b42 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1080,15 +1080,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #include #endif -#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -static inline unsigned long early_pfn_to_nid(unsigned long pfn) -{ - BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); - return 0; -} -#endif - #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 511602288e2d..8741ae0828e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1504,6 +1504,31 @@ void __free_pages_core(struct page *page, unsigned int order) static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) +{ + unsigned long start_pfn, end_pfn; + int nid; + + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } + + return nid; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + int __meminit early_pfn_to_nid(unsigned long pfn) { static DEFINE_SPINLOCK(early_pfn_lock); @@ -6310,32 +6335,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
- */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != NUMA_NO_NODE) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. -- cgit v1.2.3-59-g8ed1b
From 3f08a302f533f74ad2e909e7a61274aa7eebc0ab Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:02 -0700 Subject: mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option
CONFIG_HAVE_MEMBLOCK_NODE_MAP is used to differentiate initialization of the node and zone structures between the systems that have a region-to-node mapping in memblock and those that don't. Currently all NUMA architectures enable this option, and for non-NUMA systems we can presume that all the memory belongs to node 0; the compile-time configuration option is therefore not required. The remaining few architectures that use DISCONTIGMEM without NUMA are easily updated to use memblock_add_node() instead of memblock_add() and thus have proper correspondence of memblock regions to NUMA nodes.
Still, free_area_init_node() must have a backward compatible version because its semantics with and without CONFIG_HAVE_MEMBLOCK_NODE_MAP are different. Once all architectures use the new semantics, the entire compatibility layer can be dropped.
To avoid adding extra runtime memory to store the node id for architectures that keep memblock but have only a single node, the node id field of the memblock_region is guarded by CONFIG_NEED_MULTIPLE_NODES and the corresponding accessors presume that in those cases it is always 0.
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Acked-by: Catalin Marinas [arm64] Cc: Baoquan He Cc: Brian Cain Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-4-rppt@kernel.org Signed-off-by: Linus Torvalds --- .../features/vm/numa-memblock/arch-support.txt | 34 ------- arch/alpha/mm/numa.c | 4 +- arch/arm64/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/mm/motorola.c | 4 +- arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/x86/Kconfig | 1 - include/linux/memblock.h | 8 +- include/linux/mm.h | 12 +-- include/linux/mmzone.h | 2 +- mm/Kconfig | 3 - mm/memblock.c | 11 +-- mm/memory_hotplug.c | 4 - mm/page_alloc.c | 101 ++++++++++++--------- 20 files changed, 74 insertions(+), 119 deletions(-) delete mode 100644 Documentation/features/vm/numa-memblock/arch-support.txt (limited to 'mm/page_alloc.c') diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt deleted file mode 100644 index 3004beb0fd71..000000000000 --- a/Documentation/features/vm/numa-memblock/arch-support.txt +++ /dev/null @@ -1,34 +0,0 @@ -# -# Feature name: numa-memblock -# Kconfig: HAVE_MEMBLOCK_NODE_MAP -# description: arch supports NUMA aware memblocks -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | .. | - | arm: | .. | - | arm64: | ok | - | c6x: | .. | - | csky: | .. | - | h8300: | .. | - | hexagon: | .. | - | ia64: | ok | - | m68k: | .. | - | microblaze: | ok | - | mips: | ok | - | nds32: | TODO | - | nios2: | .. | - | openrisc: | .. | - | parisc: | .. | - | powerpc: | ok | - | riscv: | ok | - | s390: | ok | - | sh: | ok | - | sparc: | ok | - | um: | .. | - | unicore32: | .. | - | x86: | ok | - | xtensa: | .. | - ----------------------- diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index d0b73371e985..a24cd13e71cb 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -144,8 +144,8 @@ setup_memory_node(int nid, void *kernel_end) if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) panic("kernel loaded out of ram"); - memblock_add(PFN_PHYS(node_min_pfn), - (node_max_pfn - node_min_pfn) << PAGE_SHIFT); + memblock_add_node(PFN_PHYS(node_min_pfn), + (node_max_pfn - node_min_pfn) << PAGE_SHIFT, nid); /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. 
Note that we round this down, not up - node memory diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 552d36cacc05..1a9b480c6f1d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,7 +162,6 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bab7cd878464..88b05b5256a9 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -31,7 +31,6 @@ config IA64 select HAVE_FUNCTION_TRACER select TTY select HAVE_ARCH_TRACEHOOK - select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING select DMA_NONCOHERENT_MMAP select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index fc16190ec2d6..84ab5963cabb 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -386,7 +386,7 @@ void __init paging_init(void) min_addr = m68k_memory[0].addr; max_addr = min_addr + m68k_memory[0].size; - memblock_add(m68k_memory[0].addr, m68k_memory[0].size); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); for (i = 1; i < m68k_num_memory;) { if (m68k_memory[i].addr < min_addr) { printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n", @@ -397,7 +397,7 @@ void __init paging_init(void) (m68k_num_memory - i) * sizeof(struct m68k_mem_info)); continue; } - memblock_add(m68k_memory[i].addr, m68k_memory[i].size); + memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i); addr = m68k_memory[i].addr + m68k_memory[i].size; if (addr > max_addr) max_addr = addr; diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 9606c244b5b8..d262ac0c8714 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -32,7 +32,6 @@ config MICROBLAZE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_PCI select IRQ_DOMAIN diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 690718b3701a..94a91b5b7759 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -72,7 +72,6 @@ config MIPS select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b29d7cb38368..41ba42b107c0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -211,7 +211,6 @@ config PPC select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a31e1a41913a..5c07ca4d5cd6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -32,7 +32,6 @@ config RISCV select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS - select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_CONTIGUOUS if MMU select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_PERF_EVENTS diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2167bce993ff..d6dc6933adc2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -162,7 +162,6 @@ config S390 select HAVE_LIVEPATCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select 
HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP select MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 97656d20b9ea..0424b8f2f8d3 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -9,7 +9,6 @@ config SUPERH select CLKDEV_LOOKUP select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index da515fdad83d..795206b7b552 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -65,7 +65,6 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e5d38cd11df0..c669328abf58 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -192,7 +192,6 @@ config X86 select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6bc37a731d27..45abfc54da37 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES int nid; #endif }; @@ -215,7 +215,6 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -234,7 +233,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, @@ -310,10 +308,10 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); +#ifdef CONFIG_NEED_MULTIPLE_NODES static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -332,7 +330,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4288e6993dc8..5f15d8723167 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2401,9 +2401,8 @@ static inline unsigned long get_num_physpages(void) return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its + * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for 
memory holes in a more * architecture independent manner. This is a substitute for creating the * zone_sizes[] and zholes_size[] arrays and passing them to @@ -2424,9 +2423,6 @@ static inline unsigned long get_num_physpages(void) * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. */ extern void free_area_init_nodes(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); @@ -2441,13 +2437,9 @@ extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) +#ifndef CONFIG_NEED_MULTIPLE_NODES static inline int early_pfn_to_nid(unsigned long pfn) { - BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); return 0; } #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c3a77eb85b42..0c575c3d7feb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -876,7 +876,7 @@ extern int movable_zone; #ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES return movable_zone == ZONE_HIGHMEM; #else return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; diff --git a/mm/Kconfig b/mm/Kconfig index 5c0362bd8d56..3af64646f343 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK_NODE_MAP - bool - config HAVE_MEMBLOCK_PHYS_MAP bool diff --git a/mm/memblock.c b/mm/memblock.c index 43e2fd3006c1..743659d88fc4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -620,7 +620,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, *idx = ULLONG_MAX; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * Common iterator interface used to define for_each_mem_pfn_range(). 
*/ @@ -1247,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { +#ifdef CONFIG_NEED_MULTIPLE_NODES int start_rgn, end_rgn; int i, ret; @@ -1258,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); +#endif return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() @@ -1799,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr) return !memblock_is_nomap(&memblock.memory.regions[i]); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { @@ -1814,7 +1814,6 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, return memblock_get_region_node(&type->regions[mid]); } -#endif /** * memblock_is_region_memory - check if a region is a subset of memory @@ -1905,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..e67dc501576a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1372,11 +1372,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8741ae0828e1..430e35384b78 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -335,7 +335,6 @@ static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; @@ -348,7 +347,6 @@ static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -1499,8 +1497,7 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages(page, order); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; @@ -1542,7 +1539,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif +#endif /* CONFIG_NEED_MULTIPLE_NODES */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES /* Only safe to use early in boot when initialisation is single-threaded */ @@ -5936,7 +5933,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat) static bool __meminit 
overlap_memmap_init(unsigned long zone, unsigned long *pfn) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static struct memblock_region *r; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { @@ -5952,7 +5948,6 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return true; } } -#endif return false; } @@ -6585,8 +6580,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __init zone_spanned_pages_in_node(int nid, +static inline unsigned long __init compat_zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6605,7 +6599,7 @@ static inline unsigned long __init zone_spanned_pages_in_node(int nid, return zones_size[zone_type]; } -static inline unsigned long __init zone_absent_pages_in_node(int nid, +static inline unsigned long __init compat_zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6617,13 +6611,12 @@ static inline unsigned long __init zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long *zholes_size, + bool compat) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6631,17 +6624,38 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + if (compat) { + spanned = compat_zone_spanned_pages_in_node( + pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, + zones_size); + absent = compat_zone_absent_pages_in_node( + pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zholes_size); + } else { + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, + zones_size); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zholes_size); + } + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6941,10 +6955,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6961,9 +6973,10 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +static void __init __free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size, + bool compat) { pg_data_t *pgdat = NODE_DATA(nid); 
unsigned long start_pfn = 0; @@ -6975,16 +6988,16 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif + if (!compat) { + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + } else { + start_pfn = node_start_pfn; + } calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + zones_size, zholes_size, compat); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6992,6 +7005,14 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +void __init free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size) +{ + __free_area_init_node(nid, zones_size, node_start_pfn, zholes_size, + true); +} + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Initialize all valid struct pages in the range [spfn, epfn) and mark them @@ -7075,8 +7096,6 @@ static inline void __init init_unavailable_mem(void) } #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7505,8 +7524,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + __free_area_init_node(nid, NULL, + find_min_pfn_for_node(nid), NULL, false); /* Any memory on that node */ if (pgdat->node_present_pages) @@ -7571,8 +7590,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); -- cgit v1.2.3-59-g8ed1b
From fa3354e4ea39e97af906c05551a36396541d70b4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:06 -0700 Subject: mm: free_area_init: use maximal zone PFNs rather than zone sizes
Currently, architectures that use free_area_init() to initialize the memory map and the node and zone structures need to calculate zone and hole sizes themselves. We can use free_area_init_nodes() instead and let it detect the zone boundaries, while the architectures only have to supply the possible limits for the zones.
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-5-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 16 ++++++---------- arch/c6x/mm/init.c | 8 +++----- arch/h8300/mm/init.c | 6 +++--- arch/hexagon/mm/init.c | 6 +++--- arch/m68k/mm/init.c | 6 +++--- arch/m68k/mm/mcfmmu.c | 9 +++------ arch/nds32/mm/init.c | 11 ++++------- arch/nios2/mm/init.c | 8 +++----- arch/openrisc/mm/init.c | 9 +++------ arch/um/kernel/mem.c | 12 ++++-------- include/linux/mm.h | 2 +- mm/page_alloc.c | 5 ++--- 12 files changed, 38 insertions(+), 60 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 12e218d3792a..667cd21393b5 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -243,21 +243,17 @@ callback_init(void * kernel_end) */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - unsigned long dma_pfn, high_pfn; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; + unsigned long dma_pfn; dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - high_pfn = max_pfn = max_low_pfn; + max_pfn = max_low_pfn; - if (dma_pfn >= high_pfn) - zones_size[ZONE_DMA] = high_pfn; - else { - zones_size[ZONE_DMA] = dma_pfn; - zones_size[ZONE_NORMAL] = high_pfn - dma_pfn; - } + max_zone_pfn[ZONE_DMA] = dma_pfn; + max_zone_pfn[ZONE_NORMAL] = max_pfn; /* Initialize mem_map[]. */ - free_area_init(zones_size); + free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. 
*/ memset((void *)ZERO_PGE, 0, PAGE_SIZE); diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 9b374393a8f4..a97e51a3e26d 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(empty_zero_page); void __init paging_init(void) { struct pglist_data *pgdat = NODE_DATA(0); - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; empty_zero_page = (unsigned long) memblock_alloc(PAGE_SIZE, PAGE_SIZE); @@ -49,11 +49,9 @@ void __init paging_init(void) /* * Define zones */ - zones_size[ZONE_NORMAL] = (memory_end - PAGE_OFFSET) >> PAGE_SHIFT; - pgdat->node_zones[ZONE_NORMAL].zone_start_pfn = - __pa(PAGE_OFFSET) >> PAGE_SHIFT; + max_zone_pfn[ZONE_NORMAL] = memory_end >> PAGE_SHIFT; - free_area_init(zones_size); + free_area_init(max_zone_pfn); } void __init mem_init(void) diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1eab16b1a0bc..27a0020e3771 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -83,10 +83,10 @@ void __init paging_init(void) start_mem, end_mem); { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; - free_area_init(zones_size); + max_zone_pfn[ZONE_NORMAL] = end_mem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } } diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index c961773a6fff..f2e6c868e477 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -91,7 +91,7 @@ void sync_icache_dcache(pte_t pte) */ void __init paging_init(void) { - unsigned long zones_sizes[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; /* * This is not particularly well documented anywhere, but @@ -101,9 +101,9 @@ void __init paging_init(void) * adjust accordingly. */ - zones_sizes[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_sizes); /* sets up the zonelists and mem_map */ + free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ /* * Start of high memory area. Will probably need something more diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index b88d510d4fe3..6d3147662ff2 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -84,7 +84,7 @@ void __init paging_init(void) * page_alloc get different views of the world. 
*/ unsigned long end_mem = memory_end & PAGE_MASK; - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; high_memory = (void *) end_mem; @@ -98,8 +98,8 @@ void __init paging_init(void) */ set_fs (USER_DS); - zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; - free_area_init(zones_size); + max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 0ea375607767..80064e6d064f 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -39,7 +39,7 @@ void __init paging_init(void) pte_t *pg_table; unsigned long address, size; unsigned long next_pgtable, bootmem_end; - unsigned long zones_size[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; enum zone_type zone; int i; @@ -80,11 +80,8 @@ void __init paging_init(void) } current->mm = NULL; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - zones_size[zone] = 0x0; - zones_size[ZONE_DMA] = num_pages; - free_area_init(zones_size); + max_zone_pfn[ZONE_DMA] = PFN_DOWN(_ramend); + free_area_init(max_zone_pfn); } int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word) diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 0be3833f6814..91147cca4b64 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -31,16 +31,13 @@ EXPORT_SYMBOL(empty_zero_page); static void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - /* Clear the zone sizes */ - memset(zones_size, 0, sizeof(zones_size)); - - zones_size[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = max_pfn; + max_zone_pfn[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init(zones_size); + free_area_init(max_zone_pfn); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 2c609c2516b2..9afca77d10b1 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -46,17 +46,15 @@ pgd_t *pgd_current; */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - - memset(zones_size, 0, sizeof(zones_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; pagetable_init(); pgd_current = swapper_pg_dir; - zones_size[ZONE_NORMAL] = max_mapnr; + max_zone_pfn[ZONE_NORMAL] = max_mapnr; /* pass the memory from the bootmem allocator to the main allocator */ - free_area_init(zones_size); + free_area_init(max_zone_pfn); flush_dcache_range((unsigned long)empty_zero_page, (unsigned long)empty_zero_page + PAGE_SIZE); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 1f87b524db78..f94fe6d3f499 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -45,17 +45,14 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); static void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - - /* Clear the zone sizes */ - memset(zones_size, 0, sizeof(zones_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; /* * We use only ZONE_NORMAL */ - zones_size[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_size); + free_area_init(max_zone_pfn); } extern const char _s_kernel_ro[], _e_kernel_ro[]; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 30885d0b94ac..401b22f14743 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -158,8 +158,8 @@ static void __init fixaddr_user_init( void) void __init paging_init(void) 
{ - unsigned long zones_size[MAX_NR_ZONES], vaddr; - int i; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + unsigned long vaddr; empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -167,12 +167,8 @@ void __init paging_init(void) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(zones_size); i++) - zones_size[i] = 0; - - zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) - - (uml_physmem >> PAGE_SHIFT); - free_area_init(zones_size); + max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); /* * Fixed mappings, only the page table structure has to be diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f15d8723167..788704977de0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2329,7 +2329,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * zones_size); +extern void free_area_init(unsigned long * max_zone_pfn); extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 430e35384b78..cf420e947d9c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7712,11 +7712,10 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *zones_size) +void __init free_area_init(unsigned long *max_zone_pfn) { init_unavailable_mem(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + free_area_init_nodes(max_zone_pfn); } static int page_alloc_cpu_dead(unsigned int cpu) -- cgit v1.2.3-59-g8ed1b
From 9691a071aa26a21fc8dac804a2b98d3c24f76f9a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:10 -0700 Subject: mm: use free_area_init() instead of free_area_init_nodes()
free_area_init() has effectively become a wrapper for free_area_init_nodes() and there is no point in keeping it. Still, the free_area_init() name is shorter and more general, as it does not imply the necessity to initialize multiple nodes. Rename free_area_init_nodes() to free_area_init(), update the callers and drop the old version of free_area_init().
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Acked-by: Catalin Marinas Cc: Brian Cain Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-6-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/arm64/mm/init.c | 2 +- arch/ia64/mm/contig.c | 2 +- arch/ia64/mm/discontig.c | 2 +- arch/microblaze/mm/init.c | 2 +- arch/mips/loongson64/numa.c | 2 +- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/riscv/mm/init.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/sparc/mm/init_64.c | 2 +- arch/x86/mm/init.c | 2 +- include/linux/mm.h | 7 +++---- mm/page_alloc.c | 10 ++-------- 15 files changed, 18 insertions(+), 25 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d2df416b840e..5dae97f2f628 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -206,7 +206,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) #endif max_zone_pfns[ZONE_NORMAL] = max; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } #else diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 5b00dc3898e1..8786fa5c7612 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -210,6 +210,6 @@ paging_init (void) printk("Virtual mem_map starts at 0x%p\n", mem_map); } #endif /* !CONFIG_VIRTUAL_MEM_MAP */ - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 4f33f6e7e206..dd8284bcbf16 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -627,7 +627,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = max_dma; #endif max_zone_pfns[ZONE_NORMAL] = max_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 1ffbfa96b9b8..dcaa53d11339 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -112,7 +112,7 @@ static void __init paging_init(void) #endif /* We don't have holes in memory map */ - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init setup_memory(void) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 1ae072df4831..901f5be5ee76 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -247,7 +247,7 @@ void __init paging_init(void) zones_size[ZONE_DMA32] = MAX_DMA32_PFN; #endif zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init mem_init(void) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 79684000de0e..19719e8b41a5 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -418,7 +418,7 @@ void __init paging_init(void) } #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } #ifdef CONFIG_64BIT diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index a45691e6ab90..1213215ea965 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -419,7 +419,7 @@ void __init paging_init(void) pagetable_init(); 
zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init mem_init(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 041ed7cfd341..0fcea21f26b4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -271,7 +271,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); mark_nonram_nosave(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 736de6c8739f..6168b1985b77 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -39,7 +39,7 @@ static void __init zone_sizes_init(void) #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } static void setup_zero_page(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 87b2d024e75a..b11bcf4da531 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -122,7 +122,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } void mark_rodata_ro(void) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 8d2a68aea1fc..628f461b8993 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -334,7 +334,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } unsigned int mem_init_done = 0; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 1cf0d666dea3..79d3c5e0802e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2488,7 +2488,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_NORMAL] = end_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } printk("Booting Linux...\n"); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a573a3e63f02..1decb645dac0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -947,7 +947,7 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { diff --git a/include/linux/mm.h b/include/linux/mm.h index 788704977de0..ff2c19e14c1e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2329,7 +2329,6 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * max_zone_pfn); extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); @@ -2410,21 +2409,21 @@ static inline unsigned long get_num_physpages(void) * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. 
At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); + * free_area_init(max_zone_pfns); * * free_bootmem_with_active_regions() calls free_bootmem_node() for each * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. */ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf420e947d9c..644a59d17318 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7440,7 +7440,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. @@ -7452,7 +7452,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. */ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; int i, nid; @@ -7712,12 +7712,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *max_zone_pfn) -{ - init_unavailable_mem(); - free_area_init_nodes(max_zone_pfn); -} - static int page_alloc_cpu_dead(unsigned int cpu) { -- cgit v1.2.3-59-g8ed1b
From 73a6e474cb376921a311786652782155eac2fdf0 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 3 Jun 2020 15:57:55 -0700 Subject: mm: memmap_init: iterate over memblock regions rather than check each PFN
When called during boot, the memmap_init_zone() function checks if each PFN is valid and actually belongs to the node being initialized using early_pfn_valid() and early_pfn_in_nid(). Each such check may cost up to O(log(n)) where n is the number of memory banks, so for a large amount of memory the overall time spent in early_pfn*() becomes substantial. Since the information is present in memblock anyway, we can iterate over memblock memory regions in memmap_init() and only call memmap_init_zone() for PFN ranges that are known to be valid and in the appropriate node.
[cai@lca.pw: fix a compilation warning from Clang] Link: http://lkml.kernel.org/r/CF6E407F-17DC-427C-8203-21979FB882EF@lca.pw [bhe@redhat.com: fix the incorrect hole in fast_isolate_freepages()] Link: http://lkml.kernel.org/r/8C537EB7-85EE-4DCF-943E-3CC0ED0DF56D@lca.pw Link: http://lkml.kernel.org/r/20200521014407.29690-1-bhe@redhat.com
Signed-off-by: Baoquan He Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Cc: Qian Cai Link: http://lkml.kernel.org/r/20200412194859.12663-16-rppt@kernel.org Signed-off-by: Linus Torvalds
--- mm/compaction.c | 4 +++- mm/page_alloc.c | 43 ++++++++++++++++--------------------------- 2 files changed, 19 insertions(+), 28 deletions(-) (limited to 'mm/page_alloc.c')
diff --git a/mm/compaction.c b/mm/compaction.c index c9d659e6a02c..8c2961100840 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1409,7 +1409,9 @@ fast_isolate_freepages(struct compact_control *cc) cc->free_pfn = highest; } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { - page = pfn_to_page(min_pfn); + page = pageblock_pfn_to_page(min_pfn, + pageblock_end_pfn(min_pfn), + cc->zone); cc->free_pfn = min_pfn; } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 644a59d17318..40587d74cd1c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5951,23 +5951,6 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return false; } -#ifdef CONFIG_SPARSEMEM -/* Skip PFNs that belong to non-present sections */ -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - const unsigned long section_nr = pfn_to_section_nr(++pfn); - - if (present_section_nr(section_nr)) - return pfn; - return section_nr_to_pfn(next_present_section_nr(section_nr)); -} -#else -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - return pfn++; -} -#endif - /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -6007,14 +5990,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * function. They do not exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) { - pfn = next_pfn(pfn); - continue; - } - if (!early_pfn_in_nid(pfn, nid)) { - pfn++; - continue; - } if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6130,9 +6105,23 @@ static void __meminit zone_init_free_lists(struct zone *zone) } void __meminit __weak memmap_init(unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) + unsigned long zone, + unsigned long range_start_pfn) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); + unsigned long start_pfn, end_pfn; + unsigned long range_end_pfn = range_start_pfn + size; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + + if (end_pfn > start_pfn) { + size = end_pfn - start_pfn; + memmap_init_zone(size, nid, zone, start_pfn, + MEMMAP_EARLY, NULL); + } + } } static int zone_batchsize(struct zone *zone) -- cgit v1.2.3-59-g8ed1b
From acd3f5c441e9635857f02a7c21e7dd590dcf672e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:59 -0700 Subject: mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES
The memmap_init() function was made to iterate over memblock regions and as a result the early_pfn_in_nid() function became obsolete. Since CONFIG_NODES_SPAN_OTHER_NODES is only used to pick a stub or a real implementation of early_pfn_in_nid(), it is also not needed anymore.
Remove both early_pfn_in_nid() and the CONFIG_NODES_SPAN_OTHER_NODES option.
Co-developed-by: Hoan Tran Signed-off-by: Hoan Tran Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Cc: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-17-rppt@kernel.org Signed-off-by: Linus Torvalds
--- arch/powerpc/Kconfig | 9 --------- arch/sparc/Kconfig | 9 --------- arch/x86/Kconfig | 9 --------- mm/page_alloc.c | 20 -------------------- 4 files changed, 47 deletions(-) (limited to 'mm/page_alloc.c')
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 41ba42b107c0..a8eee7a64add 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -686,15 +686,6 @@ config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. -config NODES_SPAN_OTHER_NODES - def_bool y - depends on NEED_MULTIPLE_NODES - config STDBINUTILS bool "Using standard binutils settings" depends on 44x diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 795206b7b552..0e4f3891b904 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -286,15 +286,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. -config NODES_SPAN_OTHER_NODES - def_bool y - depends on NEED_MULTIPLE_NODES - config ARCH_SPARSEMEM_ENABLE def_bool y if SPARC64 select SPARSEMEM_VMEMMAP_ENABLE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c669328abf58..f3b910fe1d34 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1583,15 +1583,6 @@ config X86_64_ACPI_NUMA ---help--- Enable ACPI SRAT based node topology detection. -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details.
-config NODES_SPAN_OTHER_NODES - def_bool y - depends on X86_64_ACPI_NUMA - config NUMA_EMU bool "NUMA emulation" depends on NUMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40587d74cd1c..1f7eff7120d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1541,26 +1541,6 @@ int __meminit early_pfn_to_nid(unsigned long pfn) } #endif /* CONFIG_NEED_MULTIPLE_NODES */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - int nid; - - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0 && nid != node) - return false; - return true; -} - -#else -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return true; -} -#endif - - void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { -- cgit v1.2.3-59-g8ed1b From 51930df5801e4da60e962ea52b811634d257a148 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:58:03 -0700 Subject: mm: free_area_init: allow defining max_zone_pfn in descending order Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the ZONE_NORMAL. Allowing free_area_init() to parse the max_zone_pfn array even when it is sorted in descending order makes it possible to use free_area_init() on such architectures. Add top -> down traversal of the max_zone_pfn array in free_area_init() and use free_area_init() in the ARC node/zone initialization. [rppt@kernel.org: ARC fix] Link: http://lkml.kernel.org/r/20200504153901.GM14260@kernel.org [rppt@linux.ibm.com: arc: free_area_init(): take into account PAE40 mode] Link: http://lkml.kernel.org/r/20200507205900.GH683243@linux.ibm.com [akpm@linux-foundation.org: declare arch_has_descending_max_zone_pfns()] Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20200412194859.12663-18-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/arc/mm/init.c | 41 ++++++++++++----------------------------- include/linux/mm.h | 1 + mm/page_alloc.c | 26 +++++++++++++++++++++----- 3 files changed, 34 insertions(+), 34 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 0920c969c466..e7bdc2ac1c87 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -63,11 +63,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) low_mem_sz = size; in_use = 1; + memblock_add_node(base, size, 0); } else { #ifdef CONFIG_HIGHMEM high_mem_start = base; high_mem_sz = size; in_use = 1; + memblock_add_node(base, size, 1); #endif } @@ -75,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) base, TO_MB(size), !in_use ? "Not used":""); } +bool arch_has_descending_max_zone_pfns(void) +{ + return !IS_ENABLED(CONFIG_ARC_HAS_PAE40); +} + /* * First memory setup routine called from setup_arch() * 1.
setup swapper's mm @init_mm @@ -83,8 +90,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) */ void __init setup_arch_memory(void) { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zones_holes[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; init_mm.start_code = (unsigned long)_text; init_mm.end_code = (unsigned long)_etext; @@ -115,7 +121,6 @@ void __init setup_arch_memory(void) * the crash */ - memblock_add_node(low_mem_start, low_mem_sz, 0); memblock_reserve(CONFIG_LINUX_LINK_BASE, __pa(_end) - CONFIG_LINUX_LINK_BASE); @@ -133,22 +138,7 @@ void __init setup_arch_memory(void) memblock_dump_all(); /*----------------- node/zones setup --------------------------*/ - memset(zones_size, 0, sizeof(zones_size)); - memset(zones_holes, 0, sizeof(zones_holes)); - - zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn; - zones_holes[ZONE_NORMAL] = 0; - - /* - * We can't use the helper free_area_init(zones[]) because it uses - * PAGE_OFFSET to compute the @min_low_pfn which would be wrong - * when our kernel doesn't start at PAGE_OFFSET, i.e. - * PAGE_OFFSET != CONFIG_LINUX_RAM_BASE - */ - free_area_init_node(0, /* node-id */ - zones_size, /* num pages per zone */ - min_low_pfn, /* first pfn of node */ - zones_holes); /* holes */ + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM /* @@ -168,20 +158,13 @@ void __init setup_arch_memory(void) min_high_pfn = PFN_DOWN(high_mem_start); max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); - zones_size[ZONE_NORMAL] = 0; - zones_holes[ZONE_NORMAL] = 0; - - zones_size[ZONE_HIGHMEM] = max_high_pfn - min_high_pfn; - zones_holes[ZONE_HIGHMEM] = 0; - - free_area_init_node(1, /* node-id */ - zones_size, /* num pages per zone */ - min_high_pfn, /* first pfn of node */ - zones_holes); /* holes */ + max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; high_memory = (void *)(min_high_pfn << PAGE_SHIFT); kmap_init(); #endif + + free_area_init(max_zone_pfn); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index ff2c19e14c1e..21cf171ae9de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2473,6 +2473,7 @@ extern void setup_per_cpu_pageset(void); extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; +extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f7eff7120d7..36d93c73f2bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7408,6 +7408,15 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } } +/* + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. 
For + * such cases we allow max_zone_pfn sorted in the descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -7424,7 +7433,8 @@ static void check_for_memory(pg_data_t *pgdat, int nid) void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7433,14 +7443,20 @@ void __init free_area_init(unsigned long *max_zone_pfn) sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } -- cgit v1.2.3-59-g8ed1b From bc9331a19d758706493cbebba67ca70382edddac Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:58:09 -0700 Subject: mm: rename free_area_init_node() to free_area_init_memoryless_node() free_area_init_node() is only used by x86 to initialize a memory-less nodes. Make its name reflect this and drop all the function parameters except node ID as they are anyway zero. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Cc: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-19-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 5 +---- include/linux/mm.h | 9 +++------ mm/page_alloc.c | 7 ++----- 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fe024b2ac796..8ee952038c80 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -737,12 +737,9 @@ void __init x86_numa_init(void) static void __init init_memory_less_node(int nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - /* Allocate and initialize node data. 
Memory-less node is now online.*/ alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); + free_area_init_memoryless_node(nid); /* * All zonelists will be built later in start_kernel() after per cpu diff --git a/include/linux/mm.h b/include/linux/mm.h index 21cf171ae9de..0d998c84231c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2329,8 +2329,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void __init free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); /* @@ -2402,10 +2401,8 @@ static inline unsigned long get_num_physpages(void) /* * Using memblock node mappings, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36d93c73f2bb..cc96ecbe52f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6974,12 +6974,9 @@ static void __init __free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +void __init free_area_init_memoryless_node(int nid) { - __free_area_init_node(nid, zones_size, node_start_pfn, zholes_size, - true); + __free_area_init_node(nid, NULL, 0, NULL, false); } #if !defined(CONFIG_FLAT_NODE_MEM_MAP) -- cgit v1.2.3-59-g8ed1b From 854e8848c5841b4199a70f1838f55999cecbf3b6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:58:13 -0700 Subject: mm: clean up free_area_init_node() and its helpers free_area_init_node() now always uses memblock info and the zone PFN limits, so it does not need the backwards compatibility functions to calculate the zone spanned and absent pages. The removal of the compat_ versions of zone_{absent,spanned}_pages_in_node() in turn makes the zone_size and zhole_size parameters unused. The node_start_pfn is determined by get_pfn_range_for_nid(), so there is no need to pass it to free_area_init_node(). As a result, the only required parameter to free_area_init_node() is the node ID; all the rest are removed along with the no longer used compat_zone_{absent,spanned}_pages_in_node() helpers. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Cc: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-20-rppt@kernel.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 104 ++++++++++++-------------------------------------------- 1 file changed, 22 insertions(+), 82 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc96ecbe52f7..ee7ef328c9de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6436,8 +6436,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *ignored) + unsigned long *zone_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6501,8 +6500,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *ignored) + unsigned long node_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6549,43 +6547,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -static inline unsigned long __init compat_zone_spanned_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *zones_size) -{ - unsigned int zone; - - *zone_start_pfn = node_start_pfn; - for (zone = 0; zone < zone_type; zone++) - *zone_start_pfn += zones_size[zone]; - - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; - - return zones_size[zone_type]; -} - -static inline unsigned long __init compat_zone_absent_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zholes_size) -{ - if (!zholes_size) - return 0; - - return zholes_size[zone_type]; -} - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zones_size, - unsigned long *zholes_size, - bool compat) + unsigned long node_end_pfn) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6596,31 +6560,14 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long spanned, absent; unsigned long size, real_size; - if (compat) { - spanned = compat_zone_spanned_pages_in_node( - pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - absent = compat_zone_absent_pages_in_node( - pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - zholes_size); - } else { - spanned = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - absent = zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - zholes_size); - } + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + 
&zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); size = spanned; real_size = size - absent; @@ -6942,10 +6889,7 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -static void __init __free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size, - bool compat) +static void __init free_area_init_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; @@ -6954,19 +6898,16 @@ static void __init __free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; + pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; - if (!compat) { - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); - } else { - start_pfn = node_start_pfn; - } - calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size, compat); + + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6976,7 +6917,7 @@ static void __init __free_area_init_node(int nid, unsigned long *zones_size, void __init free_area_init_memoryless_node(int nid) { - __free_area_init_node(nid, NULL, 0, NULL, false); + free_area_init_node(nid); } #if !defined(CONFIG_FLAT_NODE_MEM_MAP) @@ -7506,8 +7447,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - __free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL, false); + free_area_init_node(nid); /* Any memory on that node */ if (pgdat->node_present_pages) -- cgit v1.2.3-59-g8ed1b From 8a1b25fe3ce49f7c0ad562fa294ac7f93731bd3a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:58:18 -0700 Subject: mm: simplify find_min_pfn_with_active_regions() find_min_pfn_with_active_regions() calls find_min_pfn_for_node() with nid parameter set to MAX_NUMNODES. This makes the find_min_pfn_for_node() traverse all memblock memory regions although the first PFN in the system can be easily found with memblock_start_of_DRAM(). Use memblock_start_of_DRAM() in find_min_pfn_with_active_regions() and drop now unused find_min_pfn_for_node(). Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Cc: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-21-rppt@kernel.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ee7ef328c9de..0a2a67c7f230 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7066,24 +7066,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/* Find the lowest pfn for a node */ -static unsigned long __init find_min_pfn_for_node(int nid) -{ - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) - min_pfn = min(min_pfn, start_pfn); - - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); - return 0; - } - - return min_pfn; -} - /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * @@ -7092,7 +7074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) */ unsigned long __init find_min_pfn_with_active_regions(void) { - return find_min_pfn_for_node(MAX_NUMNODES); + return PHYS_PFN(memblock_start_of_DRAM()); } /* -- cgit v1.2.3-59-g8ed1b From 833d8a426f78f19b166d93bda0569d3d6d507dba Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:58:26 -0700 Subject: mm/page_alloc.c: bad_[reason|flags] is not necessary when PageHWPoison Patch series "mm/page_alloc.c: cleanup on check page", v3. This patchset does some cleanup related to check page. 1. Remove unnecessary bad_reason assignment 2. Remove bad_flags to bad_page() 3. Rename function for naming convention 4. Extract common part to check page Thanks for suggestions from David Rientjes and Anshuman Khandual. This patch (of 5): Since function returns directly, bad_[reason|flags] is not used any where. And move this to the first. 
This is a follow-up cleanup for commit e570f56cccd21 ("mm: check_new_page_bad() directly returns in __PG_HWPOISON case") Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: David Rientjes Link: http://lkml.kernel.org/r/20200411220357.9636-2-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0a2a67c7f230..3ef7db15ec70 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2097,19 +2097,17 @@ static void check_new_page_bad(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; + if (unlikely(page->flags & __PG_HWPOISON)) { + /* Don't complain about hwpoisoned pages */ + page_mapcount_reset(page); /* remove PageBuddy */ + return; + } if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & __PG_HWPOISON)) { - bad_reason = "HWPoisoned (hardware-corrupted)"; - bad_flags = __PG_HWPOISON; - /* Don't complain about hwpoisoned pages */ - page_mapcount_reset(page); /* remove PageBuddy */ - return; - } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; -- cgit v1.2.3-59-g8ed1b From 82a3241a8f76cca20862c16693afc13fcc4622c6 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:58:29 -0700 Subject: mm/page_alloc.c: bad_flags is not necessary for bad_page() After commit 5b57b8f22709 ("mm/debug.c: always print flags in dump_page()"), page->flags is always printed for a bad page. It is no longer necessary to have bad_flags.
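For reference, the mechanism that makes bad_flags redundant: since the commit above, __dump_page() unconditionally prints both the raw and the symbolic page flags via the %pGp printk extension, roughly like this (a sketch, not the exact format string):

    pr_warn("page:%px refcount:%d mapcount:%d\n",
            page, page_ref_count(page), page_mapcount(page));
    pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);

Because the full flags word is always part of the dump, separately printing the subset that triggered the check adds no information.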
Suggested-by: Anshuman Khandual Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: David Hildenbrand Cc: David Rientjes Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200411220357.9636-3-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3ef7db15ec70..895d85525919 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -607,8 +607,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, const char *reason, - unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason) { static unsigned long resume; static unsigned long nr_shown; @@ -637,10 +636,6 @@ static void bad_page(struct page *page, const char *reason, pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); __dump_page(page, reason); - bad_flags &= page->flags; - if (bad_flags) - pr_alert("bad because of flags: %#lx(%pGp)\n", - bad_flags, &bad_flags); dump_page_owner(page); print_modules(); @@ -1077,11 +1072,7 @@ static inline bool page_expected_state(struct page *page, static void free_pages_check_bad(struct page *page) { - const char *bad_reason; - unsigned long bad_flags; - - bad_reason = NULL; - bad_flags = 0; + const char *bad_reason = NULL; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -1089,15 +1080,13 @@ static void free_pages_check_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; - } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + bad_page(page, bad_reason); } static inline int free_pages_check(struct page *page) @@ -1128,7 +1117,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 1: /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); + bad_page(page, "nonzero compound_mapcount"); goto out; } break; @@ -1140,17 +1129,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) break; default: if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); + bad_page(page, "corrupted mapping in tail page"); goto out; } break; } if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); + bad_page(page, "PageTail not set"); goto out; } if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); + bad_page(page, "compound_head not consistent"); goto out; } ret = 0; @@ -2095,7 +2084,6 @@ static inline void expand(struct zone *zone, struct page *page, static void check_new_page_bad(struct page *page) { const char *bad_reason = NULL; - unsigned long bad_flags = 0; if (unlikely(page->flags & __PG_HWPOISON)) { /* Don't complain about hwpoisoned pages */ @@ -2108,15 +2096,13 @@ static void check_new_page_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; 
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; - } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + bad_page(page, bad_reason); } /* -- cgit v1.2.3-59-g8ed1b From 0d0c48a274d5beeeb42b904caacea01564d3222b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:58:33 -0700 Subject: mm/page_alloc.c: rename free_pages_check_bad() to check_free_page_bad() free_pages_check_bad() is the counterpart of check_new_page_bad(). Rename it to use the same naming convention. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Anshuman Khandual Cc: David Hildenbrand Cc: David Rientjes Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200411220357.9636-4-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 895d85525919..773c1005a662 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1070,7 +1070,7 @@ static inline bool page_expected_state(struct page *page, return true; } -static void free_pages_check_bad(struct page *page) +static void check_free_page_bad(struct page *page) { const char *bad_reason = NULL; @@ -1095,7 +1095,7 @@ static inline int free_pages_check(struct page *page) return 0; /* Something has gone sideways, find it */ - free_pages_check_bad(page); + check_free_page_bad(page); return 1; } -- cgit v1.2.3-59-g8ed1b From 534fe5e3c44f93ddd9daf7bb59ec2950583d0522 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:58:36 -0700 Subject: mm/page_alloc.c: rename free_pages_check() to check_free_page() free_pages_check() is the counterpart of check_new_page(). Rename it to use the same naming convention. 
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Anshuman Khandual Cc: David Hildenbrand Cc: David Rientjes Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200411220357.9636-5-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 773c1005a662..57c497d32738 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1089,7 +1089,7 @@ static void check_free_page_bad(struct page *page) bad_page(page, bad_reason); } -static inline int free_pages_check(struct page *page) +static inline int check_free_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; @@ -1181,7 +1181,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { + if (unlikely(check_free_page(page + i))) { bad++; continue; } @@ -1193,7 +1193,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += free_pages_check(page); + bad += check_free_page(page); if (bad) return false; @@ -1240,7 +1240,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_check(page); + return check_free_page(page); else return false; } @@ -1261,7 +1261,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - return free_pages_check(page); + return check_free_page(page); } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3-59-g8ed1b From 58b7f1194fe1e188a1687e45c3475a98906aae4b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:58:39 -0700 Subject: mm/page_alloc.c: extract check_[new|free]_page_bad() common part to page_bad_reason() We share similar code in check_[new|free]_page_bad() to get the page's bad reason. Let's extract it and reduce code duplication. 
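The resulting shape, condensed from the diff below: a single helper computes the reason, and the two callers differ only in the flags mask they pass, which also selects the message:

    static const char *page_bad_reason(struct page *page, unsigned long flags)
    {
        const char *bad_reason = NULL;

        /* checks run in order; a later match overwrites an earlier one */
        if (unlikely(atomic_read(&page->_mapcount) != -1))
            bad_reason = "nonzero mapcount";
        /* ... mapping, _refcount and memcg checks elided ... */
        if (unlikely(page->flags & flags))
            bad_reason = (flags == PAGE_FLAGS_CHECK_AT_PREP) ?
                "PAGE_FLAGS_CHECK_AT_PREP flag(s) set" :
                "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
        return bad_reason;
    }

The __PG_HWPOISON early return stays in check_new_page_bad() only, since it is relevant on the allocation path alone.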
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: David Rientjes Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200411220357.9636-6-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 57c497d32738..cbe73a5610a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1070,7 +1070,7 @@ static inline bool page_expected_state(struct page *page, return true; } -static void check_free_page_bad(struct page *page) +static const char *page_bad_reason(struct page *page, unsigned long flags) { const char *bad_reason = NULL; @@ -1080,13 +1080,23 @@ static void check_free_page_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; + } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason); + return bad_reason; +} + +static void check_free_page_bad(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } static inline int check_free_page(struct page *page) @@ -2083,26 +2093,14 @@ static inline void expand(struct zone *zone, struct page *page, static void check_new_page_bad(struct page *page) { - const char *bad_reason = NULL; - if (unlikely(page->flags & __PG_HWPOISON)) { /* Don't complain about hwpoisoned pages */ page_mapcount_reset(page); /* remove PageBuddy */ return; } - if (unlikely(atomic_read(&page->_mapcount) != -1)) - bad_reason = "nonzero mapcount"; - if (unlikely(page->mapping != NULL)) - bad_reason = "non-NULL mapping"; - if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; -#ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) - bad_reason = "page still charged to cgroup"; -#endif - bad_page(page, bad_reason); + + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); } /* -- cgit v1.2.3-59-g8ed1b From 16867664936e32423375bf44d240f440fff194cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 3 Jun 2020 15:58:42 -0700 Subject: mm,page_alloc,cma: conditionally prefer cma pageblocks for movable allocations Currently a CMA area is barely used by the page allocator because it is used only as a fallback from movable; however, kswapd tries hard to make sure that the fallback path isn't used. This results in a system evicting memory and pushing data into swap, while lots of CMA memory is still available. This happens despite the fact that alloc_contig_range is perfectly capable of moving any movable allocations out of the way of an allocation. To use the CMA area effectively, let's alter the rules: if the zone has more free CMA pages than half of the total free pages in the zone, use CMA pageblocks first and fall back to movable blocks on failure.
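A standalone illustration of the balancing rule with made-up numbers (in the kernel the two inputs come from the zone's NR_FREE_CMA_PAGES and NR_FREE_PAGES vmstat counters, as the diff below shows):

    #include <stdbool.h>
    #include <stdio.h>

    /* Prefer CMA when it holds more than half of the zone's free pages. */
    static bool prefer_cma(long free_cma_pages, long free_pages)
    {
        return free_cma_pages > free_pages / 2;
    }

    int main(void)
    {
        printf("%d\n", prefer_cma(600, 1000)); /* 1: try CMA pageblocks first */
        printf("%d\n", prefer_cma(300, 1000)); /* 0: CMA stays a last resort */
        return 0;
    }

The threshold keeps pressure roughly balanced between the two pools: whichever holds the larger share of free memory is consumed first, so neither pool sits idle while the other is being reclaimed.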
[guro@fb.com: ifdef the cma-specific code] Link: http://lkml.kernel.org/r/20200311225832.GA178154@carbon.DHCP.thefacebook.com Co-developed-by: Rik van Riel Signed-off-by: Roman Gushchin Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Minchan Kim Cc: Qian Cai Cc: Mel Gorman Cc: Anshuman Khandual Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200306150102.3e77354b@imladris.surriel.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbe73a5610a1..5207a9e86388 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2752,6 +2752,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; +#ifdef CONFIG_CMA + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (migratetype == MIGRATE_MOVABLE && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + return page; + } +#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { -- cgit v1.2.3-59-g8ed1b From 4ca7be24eeb3198dffdae9472d7464c8b8cadadb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 3 Jun 2020 15:58:45 -0700 Subject: mm/page_alloc.c: remove unused free_bootmem_with_active_regions Since commit 397dc00e249ec64e10 ("mips: sgi-ip27: switch from DISCONTIGMEM to SPARSEMEM"), the last caller of free_bootmem_with_active_regions() was gone. Now no user calls it any more. Let's remove it. Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Link: http://lkml.kernel.org/r/20200402143455.5145-1-bhe@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- mm/page_alloc.c | 25 ------------------------- 2 files changed, 29 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d998c84231c..4141ebcb3a65 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2415,8 +2415,6 @@ static inline unsigned long get_num_physpages(void) * memblock_add_node(base, size, nid) * free_area_init(max_zone_pfns); * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. */ @@ -2429,8 +2427,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5207a9e86388..a1a4f883b7f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6295,31 +6295,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3-59-g8ed1b From 86aaf255437af88c0ff5d20a3c98e2b42fb0beda Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 3 Jun 2020 15:58:48 -0700 Subject: mm/page_alloc.c: only tune sysctl_lowmem_reserve_ratio value once when changing it Patch series "improvements about lowmem_reserve and /proc/zoneinfo", v2. This patch (of 3): When people write to /proc/sys/vm/lowmem_reserve_ratio to change sysctl_lowmem_reserve_ratio[], setup_per_zone_lowmem_reserve() is called to recalculate all ->lowmem_reserve[] for each zone of all nodes as below: static void setup_per_zone_lowmem_reserve(void) { ... for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { ... while (idx) { ... if (sysctl_lowmem_reserve_ratio[idx] < 1) { sysctl_lowmem_reserve_ratio[idx] = 0; lower_zone->lowmem_reserve[j] = 0; } else { ... } } } } Here, sysctl_lowmem_reserve_ratio[idx] is also tuned if its value is smaller than '1'. However, sysctl_lowmem_reserve_ratio[] is set per zone type, without regard to which node a zone belongs to. That means the tuning is repeated on every node, even though it has already been done on the first node. The tuning also happens when init_per_zone_wmark_min() calls setup_per_zone_lowmem_reserve(), where nobody is actually trying to change sysctl_lowmem_reserve_ratio[]. So move the tuning into lowmem_reserve_ratio_sysctl_handler() to make the code logic more reasonable.
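For context, the value being protected is computed as managed_pages / ratio, so a ratio below 1 would make no sense as a divisor; clamping it to 0 tells the setup code to apply no protection at all. A standalone illustration with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
        long managed_pages = 1048576; /* ~4 GiB of 4 KiB pages above the zone */
        int ratio = 256;              /* sysctl_lowmem_reserve_ratio[idx] */

        /* a ratio clamped to 0 means "no protection" */
        long reserve = ratio ? managed_pages / ratio : 0;
        printf("lowmem_reserve = %ld pages\n", reserve); /* 4096 pages = 16 MiB */
        return 0;
    }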
Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Baoquan He Cc: Mel Gorman Cc: David Rientjes Link: http://lkml.kernel.org/r/20200402140113.3696-1-bhe@redhat.com Link: http://lkml.kernel.org/r/20200402140113.3696-2-bhe@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a1a4f883b7f5..f3d5f0eb6159 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7704,8 +7704,7 @@ static void setup_per_zone_lowmem_reserve(void) idx--; lower_zone = pgdat->node_zones + idx; - if (sysctl_lowmem_reserve_ratio[idx] < 1) { - sysctl_lowmem_reserve_ratio[idx] = 0; + if (!sysctl_lowmem_reserve_ratio[idx]) { lower_zone->lowmem_reserve[j] = 0; } else { lower_zone->lowmem_reserve[j] = @@ -7970,7 +7969,15 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + int i; + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + setup_per_zone_lowmem_reserve(); return 0; } -- cgit v1.2.3-59-g8ed1b From f63661566fad43c0884ad879e6ff07c55ed890f4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 3 Jun 2020 15:58:52 -0700 Subject: mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty When a memory allocation request for a specific zone cannot be satisfied, the allocator falls back to a lower zone to try allocating memory. In this case, the lower zone's ->lowmem_reserve[] helps protect its own memory resource. The higher the relevant ->lowmem_reserve[] is, the harder it is for the upper zone to get memory from this lower zone. However, this protection mechanism should only be applied to populated zones, not empty ones. Filling ->lowmem_reserve[] for an empty zone is therefore unnecessary, and may mislead people into thinking it is valid data for that zone. Node 2, zone DMA pages free 0 min 0 low 0 high 0 spanned 0 present 0 managed 0 protection: (0, 0, 1024, 1024) Node 2, zone DMA32 pages free 0 min 0 low 0 high 0 spanned 0 present 0 managed 0 protection: (0, 0, 1024, 1024) Node 2, zone Normal per-node stats nr_inactive_anon 0 nr_active_anon 143 nr_inactive_file 0 nr_active_file 0 nr_unevictable 0 nr_slab_reclaimable 45 nr_slab_unreclaimable 254 Clear out zone->lowmem_reserve[] if the zone is empty.
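The guard itself is small; a sketch of the shape of the fix (the exact hunk is in the diff below):

    if (!sysctl_lowmem_reserve_ratio[idx] ||
        !zone_managed_pages(lower_zone)) {
        /* nothing to protect in an empty zone */
        lower_zone->lowmem_reserve[j] = 0;
        continue;
    }

With this in place, the empty DMA and DMA32 zones in the example above would report protection: (0, 0, 0, 0) instead of values derived from the populated Normal zone.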
Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200402140113.3696-3-bhe@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3d5f0eb6159..5b8d0966d429 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7704,8 +7704,10 @@ static void setup_per_zone_lowmem_reserve(void) idx--; lower_zone = pgdat->node_zones + idx; - if (!sysctl_lowmem_reserve_ratio[idx]) { + if (!sysctl_lowmem_reserve_ratio[idx] || + !zone_managed_pages(lower_zone)) { lower_zone->lowmem_reserve[j] = 0; + continue; } else { lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; -- cgit v1.2.3-59-g8ed1b From 97a225e69a1f880886f33d2e65a7ace13f152caa Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 3 Jun 2020 15:59:01 -0700 Subject: mm/page_alloc: integrate classzone_idx and high_zoneidx classzone_idx is now just a different name for high_zoneidx. So, integrate them and add a comment to struct alloc_context in order to reduce future confusion about the meaning of this variable. The accessor ac_classzone_idx() is also removed, since it isn't needed after the integration. In addition, this patch renames high_zoneidx to highest_zoneidx, since that name expresses the meaning more precisely. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Ye Xiaolong Link: http://lkml.kernel.org/r/1587095923-7515-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 9 ++-- include/linux/mmzone.h | 12 ++--- include/trace/events/compaction.h | 22 ++++---- include/trace/events/vmscan.h | 14 +++-- mm/compaction.c | 64 +++++++++++------------ mm/internal.h | 21 +++++--- mm/memory_hotplug.c | 6 +-- mm/oom_kill.c | 4 +- mm/page_alloc.c | 60 +++++++++++----------- mm/slab.c | 4 +- mm/slub.c | 4 +- mm/vmscan.c | 105 ++++++++++++++++++++------------------ 12 files changed, 175 insertions(+), 150 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..3ed2f22b588a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int highest_zoneidx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); @@ -182,7 +182,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); -extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) { } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + int alloc_flags, int highest_zoneidx) { return COMPACT_SKIPPED; } @@ -232,7 +232,8 @@
static inline void kcompactd_stop(int nid) { } -static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +static inline void wakeup_kcompactd(pg_data_t *pgdat, + int order, int highest_zoneidx) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c575c3d7feb..cd8bd5f90552 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -699,13 +699,13 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; - enum zone_type kswapd_classzone_idx; + enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; - enum zone_type kcompactd_classzone_idx; + enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif @@ -783,15 +783,15 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type classzone_idx); + enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, + unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx); + unsigned long mark, int highest_zoneidx); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e5bf6ee4e814..54e5bf081171 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep, DECLARE_EVENT_CLASS(kcompactd_wake_template, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx), + TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) - __field(enum zone_type, classzone_idx) + __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; ), + /* + * classzone_idx is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. 
+ */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, - __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 74bb594ccb25..2070df64958e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, ); TRACE_EVENT(mm_vmscan_lru_isolate, - TP_PROTO(int classzone_idx, + TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, @@ -274,10 +274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, isolate_mode_t isolate_mode, int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), + TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( - __field(int, classzone_idx) + __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) @@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, ), TP_fast_assign( - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; @@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->lru = lru; ), + /* + * classzone is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. + */ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, - __entry->classzone_idx, + __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, diff --git a/mm/compaction.c b/mm/compaction.c index 8c2961100840..883355de4ace 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1968,7 +1968,7 @@ static enum compact_result compact_finished(struct compact_control *cc) */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx, + int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; @@ -1981,7 +1981,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ - if (zone_watermark_ok(zone, order, watermark, classzone_idx, + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, alloc_flags)) return COMPACT_SUCCESS; @@ -1991,9 +1991,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * watermark and alloc_flags have to match, or be more pessimistic than * the check in __isolate_free_page(). We don't use the direct * compactor's alloc_flags, as they are not relevant for freepage - * isolation. 
We however do use the direct compactor's classzone_idx to - * skip over zones where lowmem reserves would prevent allocation even - * if compaction succeeds. + * isolation. We however do use the direct compactor's highest_zoneidx + * to skip over zones where lowmem reserves would prevent allocation + * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered @@ -2002,7 +2002,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; @@ -2011,12 +2011,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2057,8 +2057,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, * Make sure at least one zone would pass __compaction_suitable if we continue * retrying the reclaim. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; enum compact_result compact_result; @@ -2071,7 +2071,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, - ac_classzone_idx(ac), available); + ac->highest_zoneidx, available); if (compact_result != COMPACT_SKIPPED) return true; } @@ -2102,7 +2102,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->classzone_idx); + cc->highest_zoneidx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; @@ -2293,7 +2293,7 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx, + unsigned int alloc_flags, int highest_zoneidx, struct page **capture) { enum compact_result ret; @@ -2305,7 +2305,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .mode = (prio == COMPACT_PRIO_ASYNC) ? 
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, - .classzone_idx = classzone_idx, + .highest_zoneidx = highest_zoneidx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), @@ -2361,8 +2361,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { enum compact_result status; if (prio > MIN_COMPACT_PRIORITY @@ -2372,7 +2372,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac), capture); + alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2507,16 +2507,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; - enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; - for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - classzone_idx) == COMPACT_CONTINUE) + highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2534,16 +2534,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .classzone_idx = pgdat->kcompactd_classzone_idx, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, - cc.classzone_idx); + cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -2592,16 +2592,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) /* * Regardless of success, we are done until woken up next. 
But remember - * the requested order/classzone_idx in case it was higher/tighter than - * our current ones + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones */ if (pgdat->kcompactd_max_order <= cc.order) pgdat->kcompactd_max_order = 0; - if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; } -void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { if (!order) return; @@ -2609,8 +2609,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - if (pgdat->kcompactd_classzone_idx > classzone_idx) - pgdat->kcompactd_classzone_idx = classzone_idx; + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; /* * Pairs with implicit barrier in wait_event_freezable() @@ -2623,7 +2623,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) return; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, - classzone_idx); + highest_zoneidx); wake_up_interruptible(&pgdat->kcompactd_wait); } @@ -2644,7 +2644,7 @@ static int kcompactd(void *p) set_freezable(); pgdat->kcompactd_max_order = 0; - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { unsigned long pflags; diff --git a/mm/internal.h b/mm/internal.h index 6220a5e6b3c7..b1f0afcbe016 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -127,10 +127,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * between functions involved in allocations, including the alloc_pages* * family of functions. * - * nodemask, migratetype and high_zoneidx are initialized only once in + * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * - * zonelist, preferred_zone and classzone_idx are set first in + * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole strucure * by a const pointer. @@ -140,12 +140,21 @@ struct alloc_context { nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; - enum zone_type high_zoneidx; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. + */ + enum zone_type highest_zoneidx; bool spread_dirty_pages; }; -#define ac_classzone_idx(ac) (ac->high_zoneidx) - /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). 
@@ -224,7 +233,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e67dc501576a..926ec704e835 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -879,13 +879,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } else { int cpu; /* - * Reset the nr_zones, order and classzone_idx before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfc357614e56..4daedf7b91f6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; @@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, - high_zoneidx, oc->nodemask) + highest_zoneidx, oc->nodemask) if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b8d0966d429..5ef1eff330a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2593,7 +2593,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, int order; bool ret; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -3462,7 +3462,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3507,7 +3507,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. 
*/ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3540,14 +3540,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); long cma_pages = 0; @@ -3565,22 +3566,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > + mark + z->lowmem_reserve[highest_zoneidx]) return true; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } @@ -3657,8 +3659,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -3713,7 +3715,7 @@ retry: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3746,7 +3748,7 @@ retry: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3905,7 +3907,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -4108,10 +4110,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. 
*/ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -4235,12 +4237,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4375,8 +4377,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4390,7 +4392,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4509,7 +4511,7 @@ retry_cpuset: * could end up iterating over non-eligible zones endlessly. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; @@ -4596,7 +4598,7 @@ retry: if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4730,7 +4732,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4769,7 +4771,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) * may get reset for allocations that ignore memory policies. 
*/ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* @@ -6867,7 +6869,7 @@ static void __init free_area_init_node(int nid) unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..9350062ffc1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; struct page *page; int nid; @@ -3124,7 +3124,7 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { nid = zone_to_nid(zone); if (cpuset_zone_allowed(zone, flags) && diff --git a/mm/slub.c b/mm/slub.c index 527209d63278..d52487919278 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1938,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1967,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); diff --git a/mm/vmscan.c b/mm/vmscan.c index b2f5deb3603c..18bfbee9a581 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3131,8 +3131,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) - WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3385,7 +3385,7 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; @@ -3397,7 +3397,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) * start prematurely when there is no boosting and a lower * zone is balanced. 
*/ - for (i = classzone_idx; i >= 0; i--) { + for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3411,9 +3411,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) /* * Returns true if there is an eligible zone balanced for the request order - * and classzone_idx + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; @@ -3423,19 +3423,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* - * If a node has no populated zone within classzone_idx, it does not + * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -3461,7 +3461,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3483,7 +3484,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - if (pgdat_balanced(pgdat, order, classzone_idx)) { + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } @@ -3547,7 +3548,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -3575,7 +3576,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3593,7 +3594,7 @@ restart: bool balanced; bool ret; - sc.reclaim_idx = classzone_idx; + sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed @@ -3623,7 +3624,7 @@ restart: * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ - balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; @@ -3723,7 +3724,7 @@ out: if (boosted) { unsigned long flags; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; @@ -3738,7 +3739,7 @@ out: * As there is now likely space, wakeup kcompact to defragment * pageblocks. 
*/ - wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); @@ -3756,22 +3757,22 @@ out: } /* - * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be - * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not - * a valid index then either kswapd runs for first time or kswapd couldn't sleep - * after previous reclaim attempt (node is still unbalanced). In that case - * return the zone index of the previous kswapd reclaim cycle. + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. */ -static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type prev_classzone_idx) +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) { - enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, - unsigned int classzone_idx) + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3788,7 +3789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3801,18 +3802,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, alloc_order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* - * If woken prematurely then reset kswapd_classzone_idx and + * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { - WRITE_ONCE(pgdat->kswapd_classzone_idx, - kswapd_classzone_idx(pgdat, classzone_idx)); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); @@ -3827,7 +3829,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * go fully to sleep until explicitly woken up. 
*/ if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3869,7 +3871,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; - unsigned int classzone_idx = MAX_NR_ZONES - 1; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -3893,22 +3895,24 @@ static int kswapd(void *p) set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - classzone_idx); + highest_zoneidx); - /* Read the new order and classzone_idx */ + /* Read the new order and highest_zoneidx */ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3929,9 +3933,10 @@ kswapd_try_sleep: * but kcompactd is woken to compact for the original * request (alloc_order). */ - trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3949,7 +3954,7 @@ kswapd_try_sleep: * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, - enum zone_type classzone_idx) + enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; @@ -3961,10 +3966,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; pgdat = zone->zone_pgdat; - curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) - WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); @@ -3974,8 +3979,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, classzone_idx) && - !pgdat_watermark_boosted(pgdat, classzone_idx))) { + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -3984,11 +3989,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * ratelimit its work. 
*/ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } -- cgit v1.2.3-59-g8ed1b
From d0ddf49b7c4a3161d28f58612672d08ee9b5db94 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:59:05 -0700 Subject: mm/page_alloc.c: use NODE_MASK_NONE in build_zonelists()

Slightly simplify the code by initializing used_mask with NODE_MASK_NONE, instead of later calling nodes_clear(). This saves a line of code.

Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Link: http://lkml.kernel.org/r/20200330220840.21228-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ef1eff330a2..818c2644a200 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5692,14 +5692,13 @@ static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; int node, load, nr_nodes = 0; - nodemask_t used_mask; + nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { -- cgit v1.2.3-59-g8ed1b
From 01c0bfe061f309b848d51619f20495ee2acd7727 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:59:08 -0700 Subject: mm: rename gfpflags_to_migratetype to gfp_migratetype for same convention

The pageblock migrate type is encoded in the GFP flags, just as zone_type and zonelist are. Currently we use gfp_zone() and gfp_zonelist() to extract the related information, so it is only consistent to use the same naming convention for the migrate type.
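To see the convention concretely, here is a minimal stand-alone model of the decoding (a sketch, not kernel code; the mask and shift follow the gfp.h definitions visible in the diff below, and the flag values are assumptions consistent with the BUILD_BUG_ON there tying the shift to ___GFP_MOVABLE):

    #include <stdio.h>

    /* Assumed flag values, matching include/linux/gfp.h at this point. */
    #define ___GFP_MOVABLE		0x08u	/* == 1 << GFP_MOVABLE_SHIFT */
    #define ___GFP_RECLAIMABLE	0x10u
    #define GFP_MOVABLE_MASK	(___GFP_RECLAIMABLE | ___GFP_MOVABLE)
    #define GFP_MOVABLE_SHIFT	3

    /* Mask and shift, the same pattern gfp_zone() uses for the zone bits. */
    static unsigned int toy_gfp_migratetype(unsigned int gfp_flags)
    {
    	return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
    }

    int main(void)
    {
    	/* 0 = unmovable, 1 = movable, 2 = reclaimable */
    	printf("%u\n", toy_gfp_migratetype(___GFP_MOVABLE));	  /* prints 1 */
    	printf("%u\n", toy_gfp_migratetype(___GFP_RECLAIMABLE)); /* prints 2 */
    	return 0;
    }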
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Link: http://lkml.kernel.org/r/20200329080823.7735-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- mm/compaction.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_owner.c | 7 +++---- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fab6d486cbb7..67a0774e080b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -312,7 +312,7 @@ struct vm_area_struct; #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 -static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) +static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); diff --git a/mm/compaction.c b/mm/compaction.c index 883355de4ace..5e3e3a972cd2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2100,7 +2100,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + cc->migratetype = gfp_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->highest_zoneidx); /* Compaction is likely to fail */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 818c2644a200..2bd8d6893b3f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4285,7 +4285,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4735,7 +4735,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; diff --git a/mm/page_owner.c b/mm/page_owner.c index 18ecde9f45b2..360461509423 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_owner = get_page_owner(page_ext); - page_mt = gfpflags_to_migratetype( - page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page) page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; - mt = gfpflags_to_migratetype(gfp_mask); + mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); -- cgit v1.2.3-59-g8ed1b From b418a0f9f0b0f17132daab8a3bb841142c2bdc44 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 3 Jun 2020 15:59:11 -0700 
Subject: mm/page_alloc.c: reset numa stats for boot pagesets

Initially, the per-cpu pagesets of each zone are set to the boot pagesets. The real pagesets are allocated later but before that happens, page allocations do occur and the numa stats for the boot pagesets get incremented since they are common to all zones at that point. The real pagesets, however, are allocated for the populated zones only. Unpopulated zones, like those associated with memory-less nodes, continue using the boot pageset and end up skewing the numa stats of the corresponding node. E.g.

$ numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 4 5 6 7
node 1 size: 8131 MB
node 1 free: 6980 MB
node distances:
node  0  1
  0: 10 40
  1: 40 10

$ numastat
                node0  node1
numa_hit          108  56495
numa_miss           0      0
numa_foreign        0      0
interleave_hit      0   4537
local_node        108  31547
other_node          0  24948

Hence, the boot pageset stats need to be cleared after the real pagesets are allocated. After this point, the stats of the boot pagesets do not change as page allocations requested for a memory-less node will either fail (if __GFP_THISNODE is used) or get fulfilled by a preferred zone of a different node based on the fallback zonelist.

[sandipan@linux.ibm.com: v3] Link: http://lkml.kernel.org/r/20200511170356.162531-1-sandipan@linux.ibm.com Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: "Aneesh Kumar K.V" Link: http://lkml.kernel.org/r/9c9c2d1b15e37f6e6bf32f99e3100035e90c4ac9.1588868430.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bd8d6893b3f..4c5fdde62ff8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6250,10 +6250,25 @@ void __init setup_per_cpu_pageset(void) { struct pglist_data *pgdat; struct zone *zone; + int __maybe_unused cpu; for_each_populated_zone(zone) setup_zone_pageset(zone); +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); + memset(pcp->vm_numa_stat_diff, 0, + sizeof(pcp->vm_numa_stat_diff)); + } +#endif + for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); -- cgit v1.2.3-59-g8ed1b
From aa09259109583b98b9d9e7ed0d8eb1b880d1eb97 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Wed, 3 Jun 2020 15:59:14 -0700 Subject: mm, page_alloc: reset the zone->watermark_boost early

Updating the zone watermarks by any means, like min_free_kbytes, watermark_scale_factor, etc., when ->watermark_boost is set will result in higher low and high watermarks than the user asked for. Below are the steps to reproduce the problem on an Android kernel running on Snapdragon hardware.

1) Default settings of the system are as below:
#cat /proc/sys/vm/min_free_kbytes = 5162
#cat /proc/zoneinfo | grep -e boost -e low -e "high " -e min -e Node
Node 0, zone Normal
min 797
low 8340
high 8539

2) Monitor the zone->watermark_boost (by adding a debug print in the kernel) and whenever it is greater than zero, write the same value of min_free_kbytes obtained from step 1.
#echo 5162 > /proc/sys/vm/min_free_kbytes

3) Then read the zone watermarks in the system while ->watermark_boost is zero. This should show the same watermark values as step 1, but higher values than asked for are shown instead:
#cat /proc/zoneinfo | grep -e boost -e low -e "high " -e min -e Node
Node 0, zone Normal
min 797
low 21148
high 21347

These higher values are because the zone watermarks are updated using the macro min_wmark_pages(zone), which also adds the zone->watermark_boost:
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)

So the steps that lead to the issue are:
1) On the extfrag event, watermarks are boosted by storing the required value in ->watermark_boost.
2) The user tries to update the zone watermark levels in the system through min_free_kbytes or watermark_scale_factor.
3) Later, when kswapd wakes up, it resets the zone->watermark_boost to zero.

In step 2), we use the min_wmark_pages() macro to store the watermarks in the zone structure, and thus the values are always offset by the ->watermark_boost value. This can be avoided by resetting the ->watermark_boost to zero before it is used.

Signed-off-by: Charan Teja Reddy Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Vinayak Menon Link: http://lkml.kernel.org/r/1589457511-4255-1-git-send-email-charante@codeaurora.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4c5fdde62ff8..d95442f7e478 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7788,9 +7788,9 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); + zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } -- cgit v1.2.3-59-g8ed1b
From ae70eddd5633fc71dccf210f237c5aefc96f4332 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 15:59:17 -0700 Subject: mm/page_alloc: restrict and formalize compound_page_dtors[]

Restrict the compound_page_dtors[] array to NR_COMPOUND_DTORS elements and explicitly position each entry according to enum compound_dtor_id. This improves protection against possible misalignment between compound_page_dtors[] and enum compound_dtor_id later on.
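The benefit of the designated form can be seen in a self-contained sketch of the idiom (hypothetical names, not the kernel's):

    typedef void (*dtor_fn)(void);

    enum dtor_id {
    	DTOR_NONE,
    	DTOR_PLAIN,
    #ifdef CONFIG_FOO	/* an optional entry shifts every later slot... */
    	DTOR_FOO,
    #endif
    	NR_DTORS,
    };

    static void plain_dtor(void) { }

    /*
     * ...but with designated initializers each function is pinned to the
     * index named by the enum, so a mismatched #ifdef in the array can no
     * longer silently misalign the table, and the explicit bound lets the
     * compiler reject out-of-range entries.
     */
    static dtor_fn const dtors[NR_DTORS] = {
    	[DTOR_NONE]	= NULL,
    	[DTOR_PLAIN]	= plain_dtor,
    };

    int main(void)
    {
    	dtors[DTOR_PLAIN]();	/* dispatch by enum index */
    	return 0;
    }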
Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Link: http://lkml.kernel.org/r/1589795958-19317-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4141ebcb3a65..32f3c17715ac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -867,7 +867,7 @@ enum compound_dtor_id { #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[]; +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d95442f7e478..045c4aeeec9a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -302,14 +302,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; -- cgit v1.2.3-59-g8ed1b
From 117003c32771df617acf66e140fbdbdeb0ac71f5 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:20 -0700 Subject: mm/page_alloc.c: call touch_nmi_watchdog() on max order boundaries in deferred init

Patch series "initialize deferred pages with interrupts enabled", v4.

Keep interrupts enabled during deferred page initialization in order to make the code more modular and allow jiffies to update. The original approach and discussion can be found here: http://lkml.kernel.org/r/20200311123848.118638-1-shile.zhang@linux.alibaba.com

This patch (of 3):

deferred_init_memmap() disables interrupts the entire time, so it calls touch_nmi_watchdog() periodically to avoid soft lockup splats. Soon it will run with interrupts enabled, at which point cond_resched() should be used instead. deferred_grow_zone() makes the same watchdog calls through code shared with deferred init but will continue to run with interrupts disabled, so it can't call cond_resched(). Pull the watchdog calls up to these two places to allow the first to be changed later, independently of the second. The frequency reduces from twice per pageblock (init and free) to once per max order block.
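The reason the two call sites must be separable is a context rule, sketched below (an illustration, not kernel code): touch_nmi_watchdog() is safe in any context but only pets the watchdog, while cond_resched() can actually yield the CPU but must not be called with interrupts disabled.

    /* Sketch of the constraint driving this split: */
    if (irqs_disabled())
    	touch_nmi_watchdog();	/* deferred_grow_zone(): cannot sleep here */
    else
    	cond_resched();		/* deferred_init_memmap(), once IRQs stay on */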
Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages") Signed-off-by: Daniel Jordan Signed-off-by: Pavel Tatashin Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dan Williams Cc: Shile Zhang Cc: Kirill Tkhai Cc: James Morris Cc: Sasha Levin Cc: Yiqian Wei Cc: [4.17+] Link: http://lkml.kernel.org/r/20200403140952.17177-2-pasha.tatashin@soleen.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 045c4aeeec9a..148cf9a73f0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1693,7 +1693,6 @@ static void __init deferred_free_pages(unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - touch_nmi_watchdog(); } else { nr_free++; } @@ -1723,7 +1722,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - touch_nmi_watchdog(); } else { page++; } @@ -1863,8 +1861,10 @@ static int __init deferred_init_memmap(void *data) * that we can avoid introducing any issues with the buddy * allocator. */ - while (spfn < epfn) + while (spfn < epfn) { nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); + } zone_empty: pgdat_resize_unlock(pgdat, &flags); @@ -1948,6 +1948,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) first_deferred_pfn = spfn; nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); /* We should only stop along section boundaries */ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) -- cgit v1.2.3-59-g8ed1b
From 3d060856adfc59afb9d029c233141334cfaba418 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 3 Jun 2020 15:59:24 -0700 Subject: mm: initialize deferred pages with interrupts enabled

Initializing struct pages is a long task and keeping interrupts disabled for the duration of this operation introduces a number of problems.

1. jiffies are not updated for a long period of time, and thus incorrect time is reported. See the proposed solution and discussion here: lkml/20200311123848.118638-1-shile.zhang@linux.alibaba.com

2. It prevents further improving deferred page initialization via intra-node multi-threading.

We are keeping interrupts disabled to solve a rather theoretical problem that was never observed in the real world (see 3a2d7fa8a3d5). Let's keep interrupts enabled. In case we ever encounter a scenario where an interrupt thread wants to allocate a large amount of memory this early in boot, we can deal with that by growing the zone (see deferred_grow_zone()) by the needed amount before starting the deferred_init_memmap() threads.
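Condensed from the diff below (a paraphrase, not a verbatim excerpt), the new ordering in deferred_init_memmap() becomes:

    pgdat_resize_lock(pgdat, &flags);
    pgdat->first_deferred_pfn = ULONG_MAX;
    /*
     * After this unlock the zone cannot be grown anymore: an interrupt
     * thread that needs memory this early must pre-grow the zone first.
     */
    pgdat_resize_unlock(pgdat, &flags);

    /* ... the long struct-page init loop now runs with interrupts on ... */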
Before:
[ 1.232459] node 0 initialised, 12058412 pages in 1ms

After:
[ 1.632580] node 0 initialised, 12051227 pages in 436ms

Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages") Reported-by: Shile Zhang Signed-off-by: Pavel Tatashin Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dan Williams Cc: James Morris Cc: Kirill Tkhai Cc: Sasha Levin Cc: Yiqian Wei Cc: [4.17+] Link: http://lkml.kernel.org/r/20200403140952.17177-3-pasha.tatashin@soleen.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/page_alloc.c | 20 +++++++------------- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd8bd5f90552..2f79ff4477ba 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -680,6 +680,8 @@ typedef struct pglist_data { /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. + * Also synchronizes pgdat->first_deferred_pfn during deferred page + * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 148cf9a73f0b..c75561a3f144 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1844,6 +1844,13 @@ static int __init deferred_init_memmap(void *data) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); pgdat->first_deferred_pfn = ULONG_MAX; + /* + * Once we unlock here, the zone cannot be grown anymore, thus if an + * interrupt thread must allocate this early in boot, zone must be + * pre-grown prior to start of deferred page initialization. + */ + pgdat_resize_unlock(pgdat, &flags); + /* Only the highest zone is deferred so find it */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { zone = pgdat->node_zones + zid; @@ -1866,8 +1873,6 @@ static int __init deferred_init_memmap(void *data) touch_nmi_watchdog(); } zone_empty: - pgdat_resize_unlock(pgdat, &flags); - /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ -1909,17 +1914,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order) pgdat_resize_lock(pgdat, &flags); - /* - * If deferred pages have been initialized while we were waiting for - * the lock, return true, as the zone was grown. The caller will retry - * this zone. We won't return to this function since the caller also - * has this static branch. - */ - if (!static_branch_unlikely(&deferred_pages)) { - pgdat_resize_unlock(pgdat, &flags); - return true; - } - /* * If someone grew this zone while we were waiting for spinlock, return * true, as there might be enough pages already. -- cgit v1.2.3-59-g8ed1b
From da97f2d56bbd880b4138916a7ef96f9881a551b2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 3 Jun 2020 15:59:27 -0700 Subject: mm: call cond_resched() from deferred_init_memmap()

Now that deferred pages are initialized with interrupts enabled we can replace touch_nmi_watchdog() with cond_resched(), as it was before 3a2d7fa8a3d5. For now, we cannot do the same in deferred_grow_zone() as it still initializes pages with interrupts disabled.
This change fixes RCU problem described in https://lkml.kernel.org/r/20200401104156.11564-2-david@redhat.com [ 60.474005] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [ 60.475000] rcu: 1-...0: (0 ticks this GP) idle=02a/1/0x4000000000000000 softirq=1/1 fqs=15000 [ 60.475000] rcu: (detected by 0, t=60002 jiffies, g=-1199, q=1) [ 60.475000] Sending NMI from CPU 0 to CPUs 1: [ 1.760091] NMI backtrace for cpu 1 [ 1.760091] CPU: 1 PID: 20 Comm: pgdatinit0 Not tainted 4.18.0-147.9.1.el8_1.x86_64 #1 [ 1.760091] Hardware name: Red Hat KVM, BIOS 1.13.0-1.module+el8.2.0+5520+4e5817f3 04/01/2014 [ 1.760091] RIP: 0010:__init_single_page.isra.65+0x10/0x4f [ 1.760091] Code: 48 83 cf 63 48 89 f8 0f 1f 40 00 48 89 c6 48 89 d7 e8 6b 18 80 ff 66 90 5b c3 31 c0 b9 10 00 00 00 49 89 f8 48 c1 e6 33 f3 ab 07 00 00 00 48 c1 e2 36 41 c7 40 34 01 00 00 00 48 c1 e0 33 41 [ 1.760091] RSP: 0000:ffffba783123be40 EFLAGS: 00000006 [ 1.760091] RAX: 0000000000000000 RBX: fffffad34405e300 RCX: 0000000000000000 [ 1.760091] RDX: 0000000000000000 RSI: 0010000000000000 RDI: fffffad34405e340 [ 1.760091] RBP: 0000000033f3177e R08: fffffad34405e300 R09: 0000000000000002 [ 1.760091] R10: 000000000000002b R11: ffff98afb691a500 R12: 0000000000000002 [ 1.760091] R13: 0000000000000000 R14: 000000003f03ea00 R15: 000000003e10178c [ 1.760091] FS: 0000000000000000(0000) GS:ffff9c9ebeb00000(0000) knlGS:0000000000000000 [ 1.760091] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.760091] CR2: 00000000ffffffff CR3: 000000a1cf20a001 CR4: 00000000003606e0 [ 1.760091] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1.760091] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1.760091] Call Trace: [ 1.760091] deferred_init_pages+0x8f/0xbf [ 1.760091] deferred_init_memmap+0x184/0x29d [ 1.760091] ? deferred_free_pages.isra.97+0xba/0xba [ 1.760091] kthread+0x112/0x130 [ 1.760091] ? kthread_flush_work_fn+0x10/0x10 [ 1.760091] ret_from_fork+0x35/0x40 [ 89.123011] node 0 initialised, 1055935372 pages in 88650ms Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages") Reported-by: Yiqian Wei Signed-off-by: Pavel Tatashin Signed-off-by: Andrew Morton Tested-by: David Hildenbrand Reviewed-by: Daniel Jordan Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Acked-by: Michal Hocko Cc: Dan Williams Cc: James Morris Cc: Kirill Tkhai Cc: Sasha Levin Cc: Shile Zhang Cc: Vlastimil Babka Cc: [4.17+] Link: http://lkml.kernel.org/r/20200403140952.17177-4-pasha.tatashin@soleen.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c75561a3f144..2dbc571f68de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1870,7 +1870,7 @@ static int __init deferred_init_memmap(void *data) */ while (spfn < epfn) { nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); - touch_nmi_watchdog(); + cond_resched(); } zone_empty: /* Sanity check that the next zone really is unpopulated */ -- cgit v1.2.3-59-g8ed1b From 89c7c4022dfccf0c48ab22f4a6fd2db3d98fe3bc Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:47 -0700 Subject: mm: don't track number of pages during deferred initialization Deferred page init used to report the number of pages initialized: node 0 initialised, 32439114 pages in 97ms Tracking this makes the code more complicated when using multiple threads. 
Given that the statistic probably has limited value, especially since a zone grows on demand so that the page count can vary, just remove it. The boot message now looks like:

node 0 deferred pages initialised in 97ms

Suggested-by: Alexander Duyck Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Reviewed-by: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Josh Triplett Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-6-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2dbc571f68de..89bd57241e08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1820,7 +1820,7 @@ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long spfn = 0, epfn = 0; unsigned long first_init_pfn, flags; unsigned long start = jiffies; struct zone *zone; @@ -1869,15 +1869,15 @@ static int __init deferred_init_memmap(void *data) * allocator. */ while (spfn < epfn) { - nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + deferred_init_maxorder(&i, zone, &spfn, &epfn); cond_resched(); } zone_empty: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", - pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); + pr_info("node %d deferred pages initialised in %ums\n", + pgdat->node_id, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; -- cgit v1.2.3-59-g8ed1b
From e44431498f5fbf427f139aa413cf381b4fa3a600 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:51 -0700 Subject: mm: parallelize deferred_init_memmap()

Deferred struct page init is a significant bottleneck in kernel boot. Optimizing it maximizes availability for large-memory systems and allows spinning up short-lived VMs as needed without having to leave them running. It also benefits bare metal machines hosting VMs that are sensitive to downtime. In projects such as VMM Fast Restart[1], where guest state is preserved across kexec reboot, it helps prevent application and network timeouts in the guests.

Multithread to take full advantage of system memory bandwidth. The maximum number of threads is capped at the number of CPUs on the node because speedups always improve with additional threads on every system tested, and at this phase of boot, the system is otherwise idle and waiting on page init to finish. Helper threads operate on section-aligned ranges both to avoid false sharing when setting the pageblock's migrate type and to avoid accessing uninitialized buddy pages, though max order alignment is enough for the latter. The minimum chunk size is also a section. There was benefit to using multiple threads even on relatively small memory (1G) systems, and this is the smallest size that the alignment allows. The times (milliseconds) below are for the slowest node to initialize, since boot blocks until all nodes finish.
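Mechanically, the fan-out is a single padata job per free-memory range; the sketch below condenses the job setup from the diff at the end of this patch, with field comments added here for orientation:

    struct padata_mt_job job = {
    	.thread_fn	= deferred_init_memmap_chunk,	/* per-chunk worker */
    	.fn_arg		= zone,
    	.start		= spfn,			/* first PFN of the range */
    	.size		= epfn_align - spfn,	/* number of PFNs to init */
    	.align		= PAGES_PER_SECTION,	/* chunk boundaries */
    	.min_chunk	= PAGES_PER_SECTION,	/* smallest useful unit */
    	.max_threads	= max_threads,		/* up to the node's CPUs */
    };

    padata_do_multithreaded(&job);	/* returns when the whole range is done */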
intel_pstate is loaded in active mode without hwp and with turbo enabled, and intel_idle is active as well.

Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
  2 nodes * 26 cores * 2 threads = 104 CPUs
  384G/node = 768G memory

               kernel boot              deferred init
               ------------------------ ------------------------
  node% (thr)  speedup  time_ms (stdev)  speedup  time_ms (stdev)
       (  0)       --   4089.7 (  8.1)       --   1785.7 (  7.6)
    2% (  1)     1.7%   4019.3 (  1.5)     3.8%   1717.7 ( 11.8)
   12% (  6)    34.9%   2662.7 (  2.9)    79.9%    359.3 (  0.6)
   25% ( 13)    39.9%   2459.0 (  3.6)    91.2%    157.0 (  0.0)
   37% ( 19)    39.2%   2485.0 ( 29.7)    90.4%    172.0 ( 28.6)
   50% ( 26)    39.3%   2482.7 ( 25.7)    90.3%    173.7 ( 30.0)
   75% ( 39)    39.0%   2495.7 (  5.5)    89.4%    190.0 (  1.0)
  100% ( 52)    40.2%   2443.7 (  3.8)    92.3%    138.0 (  1.0)

Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, kvm guest)
  1 node * 16 cores * 2 threads = 32 CPUs
  192G/node = 192G memory

               kernel boot              deferred init
               ------------------------ ------------------------
  node% (thr)  speedup  time_ms (stdev)  speedup  time_ms (stdev)
       (  0)       --   1988.7 (  9.6)       --   1096.0 ( 11.5)
    3% (  1)     1.1%   1967.0 ( 17.6)     0.3%   1092.7 ( 11.0)
   12% (  4)    41.1%   1170.3 ( 14.2)    73.8%    287.0 (  3.6)
   25% (  8)    47.1%   1052.7 ( 21.9)    83.9%    177.0 ( 13.5)
   38% ( 12)    48.9%   1016.3 ( 12.1)    86.8%    144.7 (  1.5)
   50% ( 16)    48.9%   1015.7 (  8.1)    87.8%    134.0 (  4.4)
   75% ( 24)    49.1%   1012.3 (  3.1)    88.1%    130.3 (  2.3)
  100% ( 32)    49.5%   1004.0 (  5.3)    88.5%    125.7 (  2.1)

Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
  2 nodes * 18 cores * 2 threads = 72 CPUs
  128G/node = 256G memory

               kernel boot              deferred init
               ------------------------ ------------------------
  node% (thr)  speedup  time_ms (stdev)  speedup  time_ms (stdev)
       (  0)       --   1680.0 (  4.6)       --    627.0 (  4.0)
    3% (  1)     0.3%   1675.7 (  4.5)    -0.2%    628.0 (  3.6)
   11% (  4)    25.6%   1250.7 (  2.1)    67.9%    201.0 (  0.0)
   25% (  9)    30.7%   1164.0 ( 17.3)    81.8%    114.3 ( 17.7)
   36% ( 13)    31.4%   1152.7 ( 10.8)    84.0%    100.3 ( 17.9)
   50% ( 18)    31.5%   1150.7 (  9.3)    83.9%    101.0 ( 14.1)
   75% ( 27)    31.7%   1148.0 (  5.6)    84.5%     97.3 (  6.4)
  100% ( 36)    32.0%   1142.3 (  4.0)    85.6%     90.0 (  1.0)

AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
  1 node * 8 cores * 2 threads = 16 CPUs
  64G/node = 64G memory

               kernel boot              deferred init
               ------------------------ ------------------------
  node% (thr)  speedup  time_ms (stdev)  speedup  time_ms (stdev)
       (  0)       --   1029.3 ( 25.1)       --    240.7 (  1.5)
    6% (  1)    -0.6%   1036.0 (  7.8)    -2.2%    246.0 (  0.0)
   12% (  2)    11.8%    907.7 (  8.6)    44.7%    133.0 (  1.0)
   25% (  4)    13.9%    886.0 ( 10.6)    62.6%     90.0 (  6.0)
   38% (  6)    17.8%    845.7 ( 14.2)    69.1%     74.3 (  3.8)
   50% (  8)    16.8%    856.0 ( 22.1)    72.9%     65.3 (  5.7)
   75% ( 12)    15.4%    871.0 ( 29.2)    79.8%     48.7 (  7.4)
  100% ( 16)    21.0%    813.7 ( 21.0)    80.5%     47.0 (  5.2)

Server-oriented distros that enable deferred page init sometimes run in small VMs, and they still benefit even though the fraction of boot time saved is smaller:

AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
  1 node * 2 cores * 2 threads = 4 CPUs
  16G/node = 16G memory

               kernel boot              deferred init
               ------------------------ ------------------------
  node% (thr)  speedup  time_ms (stdev)  speedup  time_ms (stdev)
       (  0)       --    716.0 ( 14.0)       --     49.7 (  0.6)
   25% (  1)     1.8%    703.0 (  5.3)    -4.0%     51.7 (  0.6)
   50% (  2)     1.6%    704.7 (  1.2)    43.0%     28.3 (  0.6)
   75% (  3)     2.7%    696.7 ( 13.1)    49.7%     25.0 (  0.0)
  100% (  4)     4.1%    687.0 ( 10.4)    55.7%     22.0 (  0.0)

Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
  1 node * 2 cores * 2 threads = 4 CPUs
  14G/node = 14G memory

               kernel boot              deferred init
               ------------------------ ------------------------
  node% (thr)  speedup  time_ms (stdev)  speedup  time_ms (stdev)
       (  0)       --    787.7 (  6.4)       --    122.3 (  0.6)
   25% (  1)     0.2%    786.3 ( 10.8)    -2.5%    125.3 (  2.1)
   50% (  2)     5.9%    741.0 ( 13.9)    37.6%     76.3 ( 19.7)
   75% (  3)     8.3%    722.0 ( 19.0)    49.9%     61.3 (  3.2)
  100% (  4)     9.3%    714.7 (  9.5)    56.4%     53.3 (  1.5)

On Josh's 96-CPU and 192G memory system:

Without this patch series:
[ 0.487132] node 0 initialised, 23398907 pages in 292ms
[ 0.499132] node 1 initialised, 24189223 pages in 304ms
...
[ 0.629376] Run /sbin/init as init process

With this patch series:
[ 0.231435] node 1 initialised, 24189223 pages in 32ms
[ 0.236718] node 0 initialised, 23398907 pages in 36ms

[1] https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf

Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Reviewed-by: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-7-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- mm/Kconfig | 6 +++--- mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/Kconfig b/mm/Kconfig index 3af64646f343..e3490ecac839 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -747,13 +747,13 @@ config DEFERRED_STRUCT_PAGE_INIT depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT + select PADATA help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up - a subset of memmap at boot and then initialise the rest in parallel - by starting one-off "pgdatinitX" kernel thread for each node X. This - has a potential performance impact on processes running early in the + a subset of memmap at boot and then initialise the rest in parallel. + This has a potential performance impact on tasks running early in the lifetime of the system until these kthreads finish the initialisation. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89bd57241e08..27ec5dc4db33 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -1815,6 +1816,26 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, return nr_pages; } +static void __init +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + void *arg) +{ + unsigned long spfn, epfn; + struct zone *zone = arg; + u64 i; + + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); + + /* + * Initialize and free pages in MAX_ORDER sized increments so that we + * can avoid introducing any issues with the buddy allocator.

From ecd096506922332fdb36ff1211e03601befe6e18 Mon Sep 17 00:00:00 2001
From: Daniel Jordan
Date: Wed, 3 Jun 2020 15:59:55 -0700
Subject: mm: make deferred init's max threads arch-specific

Using padata during deferred init has only been tested on x86, so for now
limit it to this architecture.

If another arch wants this, it can find the max thread limit that's best
for it and override deferred_page_init_max_threads().

Signed-off-by: Daniel Jordan
Signed-off-by: Andrew Morton
Tested-by: Josh Triplett
Cc: Alexander Duyck
Cc: Alex Williamson
Cc: Dan Williams
Cc: Dave Hansen
Cc: David Hildenbrand
Cc: Herbert Xu
Cc: Jason Gunthorpe
Cc: Jonathan Corbet
Cc: Kirill Tkhai
Cc: Michal Hocko
Cc: Pavel Machek
Cc: Pavel Tatashin
Cc: Peter Zijlstra
Cc: Randy Dunlap
Cc: Robert Elliott
Cc: Shile Zhang
Cc: Steffen Klassert
Cc: Steven Sistare
Cc: Tejun Heo
Cc: Zi Yan
Link: http://lkml.kernel.org/r/20200527173608.2885243-8-daniel.m.jordan@oracle.com
Signed-off-by: Linus Torvalds
---
 arch/x86/mm/init_64.c    | 12 ++++++++++++
 include/linux/memblock.h |  3 +++
 mm/page_alloc.c          | 13 ++++++++-----
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 96274a90c5ff..e08f1007f776 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1265,6 +1265,18 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 }
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+	/*
+	 * More CPUs always led to greater speedups on tested systems, up to
+	 * all the nodes' CPUs.  Use all since the system is otherwise idle
+	 * now.
+	 */
+	return max_t(int, cpumask_weight(node_cpumask), 1);
+}
+#endif
+
 int kernel_set_to_readonly;
 
 void mark_rodata_ro(void)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 45abfc54da37..807ab9daf0cd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -273,6 +273,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
 #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
 	for (; i != U64_MAX; \
 	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27ec5dc4db33..fb9dec1c1976 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1836,6 +1836,13 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 	}
 }
 
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+	return 1;
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
@@ -1884,11 +1891,7 @@ static int __init deferred_init_memmap(void *data)
 		    first_init_pfn))
 		goto zone_empty;
 
-	/*
-	 * More CPUs always led to greater speedups on tested systems, up to
-	 * all the nodes' CPUs.  Use all since the system is otherwise idle now.
-	 */
-	max_threads = max(cpumask_weight(cpumask), 1u);
+	max_threads = deferred_page_init_max_threads(cpumask);
 
 	while (spfn < epfn) {
 		unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
--
cgit v1.2.3-59-g8ed1b
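
[Editor's note: the override in this patch works through a weak symbol.
__weak expands to __attribute__((weak)), which makes the generic
definition a fallback that any strong, arch-provided definition silently
replaces at link time. A schematic sketch with an invented name,
foo_max_threads; the two definitions live in different files, as the
comments indicate.]

    #include <linux/compiler.h>	/* __weak */
    #include <linux/cpumask.h>
    #include <linux/init.h>
    #include <linux/kernel.h>	/* max_t() */

    /* lib/generic.c: weak fallback, used when no arch defines the symbol. */
    __weak int __init foo_max_threads(const struct cpumask *node_cpumask)
    {
    	return 1;	/* conservative default: stay single-threaded */
    }

    /* arch/x86/foo.c: a strong definition wins over the weak one. */
    int __init foo_max_threads(const struct cpumask *node_cpumask)
    {
    	/* e.g. use every CPU of the node, as the x86 override above does */
    	return max_t(int, cpumask_weight(node_cpumask), 1);
    }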

From 633bf2fe8da0520c74ee73e9eb82d78948c3b3cc Mon Sep 17 00:00:00 2001
From: Chen Tao
Date: Wed, 3 Jun 2020 16:00:02 -0700
Subject: mm/page_alloc.c: add missing newline

Add missing line breaks on pr_warn().

Signed-off-by: Chen Tao
Signed-off-by: Andrew Morton
Reviewed-by: Andrew Morton
Link: http://lkml.kernel.org/r/20200603063547.235825-1-chentao107@huawei.com
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fb9dec1c1976..987d1638588f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7182,7 +7182,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	}
 
 	if (mem_below_4gb_not_mirrored)
-		pr_warn("This configuration results in unmirrored kernel memory.");
+		pr_warn("This configuration results in unmirrored kernel memory.\n");
 
 	goto out2;
 }
--
cgit v1.2.3-59-g8ed1b
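
[Editor's note: the trailing newline matters because printk() only closes
a log record at a '\n'. An unterminated message can be held open and have
later pr_cont() output, possibly from unrelated code, glued onto the same
line. A sketch of the difference:]

    #include <linux/init.h>
    #include <linux/printk.h>

    static void __init newline_demo(void)
    {
    	/* Unterminated: the record stays open; a continuation is appended. */
    	pr_warn("This configuration results in unmirrored kernel memory.");
    	pr_cont(" <- this lands on the same line\n");

    	/* Terminated, as the patch makes it: the record ends right here. */
    	pr_warn("This configuration results in unmirrored kernel memory.\n");
    }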

From 730ec8c01a2bd6a311ada404398f44c142ac5e8e Mon Sep 17 00:00:00 2001
From: Maninder Singh
Date: Wed, 3 Jun 2020 16:01:18 -0700
Subject: mm/vmscan.c: change prototype for shrink_page_list

commit 3c710c1ad11b ("mm, vmscan: extract shrink_page_list reclaim
counters into a struct") changed the data type used by the function, so
change the return type of the function and of its callers to match.

Signed-off-by: Vaneet Narang
Signed-off-by: Maninder Singh
Signed-off-by: Andrew Morton
Acked-by: Michal Hocko
Cc: Amit Sahrawat
Cc: Mel Gorman
Cc: Vlastimil Babka
Link: http://lkml.kernel.org/r/1588168259-25604-1-git-send-email-maninder1.s@samsung.com
Signed-off-by: Linus Torvalds
---
 mm/internal.h   |  2 +-
 mm/page_alloc.c |  2 +-
 mm/vmscan.c     | 24 ++++++++++++------------
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/internal.h b/mm/internal.h
index b1f0afcbe016..9117bca90f4b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -538,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 					    struct list_head *page_list);
 
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
 #define ALLOC_WMARK_MIN	WMARK_MIN
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987d1638588f..cf0a1720e9d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8355,7 +8355,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
-	unsigned long nr_reclaimed;
+	unsigned int nr_reclaimed;
 	unsigned long pfn = start;
 	unsigned int tries = 0;
 	int ret = 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8be3d52548ca..0539c6766a24 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1066,17 +1066,17 @@ static void page_check_dirty_writeback(struct page *page,
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
-static unsigned long shrink_page_list(struct list_head *page_list,
-				      struct pglist_data *pgdat,
-				      struct scan_control *sc,
-				      enum ttu_flags ttu_flags,
-				      struct reclaim_stat *stat,
-				      bool ignore_references)
+static unsigned int shrink_page_list(struct list_head *page_list,
+				     struct pglist_data *pgdat,
+				     struct scan_control *sc,
+				     enum ttu_flags ttu_flags,
+				     struct reclaim_stat *stat,
+				     bool ignore_references)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
-	unsigned nr_reclaimed = 0;
-	unsigned pgactivate = 0;
+	unsigned int nr_reclaimed = 0;
+	unsigned int pgactivate = 0;
 
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
@@ -1487,7 +1487,7 @@ keep:
 	return nr_reclaimed;
 }
 
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 					    struct list_head *page_list)
 {
 	struct scan_control sc = {
@@ -1496,7 +1496,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		.may_unmap = 1,
 	};
 	struct reclaim_stat stat;
-	unsigned long nr_reclaimed;
+	unsigned int nr_reclaimed;
 	struct page *page, *next;
 	LIST_HEAD(clean_pages);
@@ -1910,7 +1910,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 {
 	LIST_HEAD(page_list);
 	unsigned long nr_scanned;
-	unsigned long nr_reclaimed = 0;
+	unsigned int nr_reclaimed = 0;
 	unsigned long nr_taken;
 	struct reclaim_stat stat;
 	int file = is_file_lru(lru);
@@ -2106,7 +2106,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 unsigned long reclaim_pages(struct list_head *page_list)
 {
 	int nid = NUMA_NO_NODE;
-	unsigned long nr_reclaimed = 0;
+	unsigned int nr_reclaimed = 0;
 	LIST_HEAD(node_page_list);
 	struct reclaim_stat dummy_stat;
 	struct page *page;
--
cgit v1.2.3-59-g8ed1b
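
[Editor's note: for context on the type change, the totals that
shrink_page_list() returns are sums of struct reclaim_stat counters,
which are unsigned int, so the old unsigned long return type advertised
a wider range than the function could ever produce; the patch makes every
link in the chain the same width. A tiny illustration with invented
names, demo_stat and demo_reclaim:]

    /* Counter is 32-bit, as the fields of struct reclaim_stat are... */
    struct demo_stat {
    	unsigned int nr_reclaimed;
    };

    /* ...so the return type matches the width actually accumulated. */
    static unsigned int demo_reclaim(struct demo_stat *stat)
    {
    	unsigned int nr_reclaimed = 0;

    	nr_reclaimed += stat->nr_reclaimed;
    	return nr_reclaimed;
    }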