From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Wed, 24 Jun 2015 16:58:09 -0700
Subject: mm/memblock: add extra "flags" to memblock to allow selection of
 memory based on attribute

Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check.  Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).

But we have no recovery path for errors encountered during kernel code
execution.  Except for some very specific cases we are unlikely to ever
be able to recover.

Enter memory mirroring.  Actually the 3rd generation of memory mirroring.

Gen1: All memory is mirrored
	Pro: No s/w enabling - h/w just gets good data from other side of
	     the mirror
	Con: Halves effective memory capacity available to OS/applications

Gen2: Partial memory mirror - just mirror memory behind some memory
      controllers
	Pro: Keep more of the capacity
	Con: Nightmare to enable. Have to choose between allocating from
	     mirrored memory for safety vs. NUMA local memory for performance

Gen3: Address range partial memory mirror - some mirror on each memory
      controller
	Pro: Can tune the amount of mirror and keep NUMA performance
	Con: I have to write memory management code to implement

The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:

1) This patch series - find the mirrored memory, use it for boot time
   allocations

2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
   unused mirrored memory from mm/memblock.c and only give it out to
   select kernel allocations (this is still being scoped because
   page_alloc.c is scary).

This patch (of 3):

Add extra "flags" to memblock to allow selection of memory based on
attribute.  No functional changes.

Signed-off-by: Tony Luck
Cc: Xishi Qiu
Cc: Hanjun Guo
Cc: Xiexiuqi
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: "H. Peter Anvin"
Cc: Yinghai Lu
Cc: Naoya Horiguchi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memblock.c | 55 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 20 deletions(-)

(limited to 'mm/memblock.c')

diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b567ed79..b9ff2f4f0285 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -107,6 +107,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Utility called from memblock_find_in_range_node(), find free area bottom-up.
  *
@@ -115,12 +116,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  */
 static phys_addr_t __init_memblock
 __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
-				phys_addr_t size, phys_addr_t align, int nid)
+				phys_addr_t size, phys_addr_t align, int nid,
+				ulong flags)
 {
 	phys_addr_t this_start, this_end, cand;
 	u64 i;
 
-	for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+	for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
 		this_start = clamp(this_start, start, end);
 		this_end = clamp(this_end, start, end);
 
@@ -139,6 +141,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Utility called from memblock_find_in_range_node(), find free area top-down.
  *
@@ -147,12 +150,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  */
 static phys_addr_t __init_memblock
 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
-			       phys_addr_t size, phys_addr_t align, int nid)
+			       phys_addr_t size, phys_addr_t align, int nid,
+			       ulong flags)
 {
 	phys_addr_t this_start, this_end, cand;
 	u64 i;
 
-	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+	for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+					NULL) {
 		this_start = clamp(this_start, start, end);
 		this_end = clamp(this_end, start, end);
 
@@ -174,6 +179,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
 * @start: start of candidate range
 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
 *
 * Find @size free area aligned to @align in the specified range and node.
 *
@@ -190,7 +196,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid)
+					phys_addr_t end, int nid, ulong flags)
 {
 	phys_addr_t kernel_end, ret;
 
@@ -215,7 +221,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 
 		/* ok, try bottom-up allocation first */
 		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
-						      size, align, nid);
+						      size, align, nid, flags);
 		if (ret)
 			return ret;
 
@@ -233,7 +239,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 			"memory hotunplug may be affected\n");
 	}
 
-	return __memblock_find_range_top_down(start, end, size, align, nid);
+	return __memblock_find_range_top_down(start, end, size, align, nid,
+					      flags);
 }
 
 /**
@@ -253,7 +260,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t align)
 {
 	return memblock_find_in_range_node(size, align, start, end,
-					   NUMA_NO_NODE);
+					   NUMA_NO_NODE, MEMBLOCK_NONE);
 }
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -782,6 +789,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * __next__mem_range - next function for for_each_free_mem_range() etc.
  * @idx: pointer to u64 loop variable
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +811,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * As both region arrays are sorted, the function advances the two indices
  * in lockstep and returns each intersection.
  */
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
 				      struct memblock_type *type_a,
 				      struct memblock_type *type_b,
 				      phys_addr_t *out_start,
@@ -895,6 +903,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  *
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +912,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  *
  * Reverse of __next_mem_range().
  */
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 					  struct memblock_type *type_a,
 					  struct memblock_type *type_b,
 					  phys_addr_t *out_start,
@@ -1050,14 +1059,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 
 static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid)
+					phys_addr_t end, int nid, ulong flags)
 {
 	phys_addr_t found;
 
 	if (!align)
 		align = SMP_CACHE_BYTES;
 
-	found = memblock_find_in_range_node(size, align, start, end, nid);
+	found = memblock_find_in_range_node(size, align, start, end, nid,
+					    flags);
 	if (found && !memblock_reserve(found, size)) {
 		/*
 		 * The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1080,30 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 }
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end)
+					phys_addr_t start, phys_addr_t end,
+					ulong flags)
 {
-	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+					flags);
 }
 
 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t max_addr,
-					int nid)
+					int nid, ulong flags)
 {
-	return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+	return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
 }
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-	return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+	return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+				       nid, MEMBLOCK_NONE);
 }
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
 {
-	return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+	return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
+				       MEMBLOCK_NONE);
 }
 
 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1173,13 +1187,14 @@ static void * __init memblock_virt_alloc_internal(
 
 again:
 	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
-					    nid);
+					    nid, MEMBLOCK_NONE);
 	if (alloc)
 		goto done;
 
 	if (nid != NUMA_NO_NODE) {
 		alloc = memblock_find_in_range_node(size, align, min_addr,
-						    max_addr, NUMA_NO_NODE);
+						    max_addr, NUMA_NO_NODE,
+						    MEMBLOCK_NONE);
 		if (alloc)
 			goto done;
 	}
--
cgit v1.2.3-59-g8ed1b
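
The core idea of this first patch is that every free-range lookup now carries
a flags mask, and the range iterators skip regions whose attributes do not
match it.  The stand-alone sketch below models that filter outside the
kernel; the region map, the find_free_range() helper, and the REGION_*
constants are all inventions for illustration, not the real memblock API.

/*
 * Minimal user-space sketch (NOT kernel code) of attribute-filtered
 * range finding: callers pass a flags mask and the search loop skips
 * regions that do not satisfy it, mirroring the check added to
 * __next_mem_range() in this patch.
 */
#include <stdio.h>
#include <stdint.h>

#define REGION_NONE   0x0UL	/* no special request (cf. MEMBLOCK_NONE) */
#define REGION_MIRROR 0x2UL	/* mirrored region (cf. MEMBLOCK_MIRROR) */

struct region {
	uint64_t base;
	uint64_t size;
	unsigned long flags;
};

/* Toy "memory map": one mirrored region, one ordinary region. */
static struct region map[] = {
	{ 0x00100000, 0x00400000, REGION_MIRROR },
	{ 0x01000000, 0x10000000, REGION_NONE },
};

/*
 * Top-down search over the map, analogous in spirit to
 * __memblock_find_range_top_down(): a REGION_MIRROR request only sees
 * mirrored regions, while REGION_NONE accepts any region.
 */
static uint64_t find_free_range(uint64_t size, unsigned long flags)
{
	int i;

	for (i = (int)(sizeof(map) / sizeof(map[0])) - 1; i >= 0; i--) {
		/* if we want mirrored memory, skip non-mirrored regions */
		if ((flags & REGION_MIRROR) && !(map[i].flags & REGION_MIRROR))
			continue;
		if (map[i].size >= size)
			return map[i].base;
	}
	return 0;	/* nothing suitable found */
}

int main(void)
{
	printf("any memory:      %#llx\n",
	       (unsigned long long)find_free_range(0x100000, REGION_NONE));
	printf("mirrored memory: %#llx\n",
	       (unsigned long long)find_free_range(0x100000, REGION_MIRROR));
	return 0;
}

Note how the flag is a filter, not a new allocator: the same search loop
serves both callers, which is why the patch can thread "flags" through the
existing call chain with no functional change when MEMBLOCK_NONE is passed.
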
From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Wed, 24 Jun 2015 16:58:12 -0700
Subject: mm/memblock: allocate boot time data structures from mirrored memory

Try to allocate all boot time kernel data structures from mirrored
memory.

If we run out of mirrored memory print warnings, but fall back to using
non-mirrored memory to make sure that we still boot.

By number of bytes, most of what we allocate at boot time is the page
structures.  64 bytes per 4K page on x86_64 ... or about 1.5% of total
system memory.  For workloads where the bulk of memory is allocated to
applications this may represent a useful improvement to system
availability, since 1.5% of total memory might be a third of the memory
allocated to the kernel.

Signed-off-by: Tony Luck
Cc: Xishi Qiu
Cc: Hanjun Guo
Cc: Xiexiuqi
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: "H. Peter Anvin"
Cc: Yinghai Lu
Cc: Naoya Horiguchi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memblock.h |  8 +++++
 mm/memblock.c            | 78 +++++++++++++++++++++++++++++++++++++++++-------
 mm/nobootmem.c           | 10 ++++++-
 3 files changed, 84 insertions(+), 12 deletions(-)

(limited to 'mm/memblock.c')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 7aeec0cb4c27..0215ffd63069 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -24,6 +24,7 @@
 enum {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
 	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
+	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 };
 
 struct memblock_region {
@@ -78,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
+ulong choose_memblock_flags(void);
 
 /* Low level functions */
 int memblock_add_range(struct memblock_type *type,
@@ -160,6 +163,11 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif
 
+static inline bool memblock_is_mirror(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_MIRROR;
+}
+
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long *end_pfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index b9ff2f4f0285..1b444c730846 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
 #ifdef CONFIG_MOVABLE_NODE
 bool movable_node_enabled __initdata_memblock = false;
 #endif
+static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
 
+ulong __init_memblock choose_memblock_flags(void)
+{
+	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
+}
+
 /* inline so we don't get a warning when pr_debug is compiled out */
 static __init_memblock const char *
 memblock_type_name(struct memblock_type *type)
@@ -259,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align)
 {
-	return memblock_find_in_range_node(size, align, start, end,
-					   NUMA_NO_NODE, MEMBLOCK_NONE);
+	phys_addr_t ret;
+	ulong flags = choose_memblock_flags();
+
+again:
+	ret = memblock_find_in_range_node(size, align, start, end,
+					  NUMA_NO_NODE, flags);
+
+	if (!ret && (flags & MEMBLOCK_MIRROR)) {
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		flags &= ~MEMBLOCK_MIRROR;
+		goto again;
+	}
+
+	return ret;
 }
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -785,6 +804,21 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 	return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
 }
 
+/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+	system_has_some_mirror = true;
+
+	return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+
 /**
  * __next__mem_range - next function for for_each_free_mem_range() etc.
  * @idx: pointer to u64 loop variable
@@ -839,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
 		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
 			continue;
 
+		/* if we want mirror memory skip non-mirror memory regions */
+		if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+			continue;
+
 		if (!type_b) {
 			if (out_start)
 				*out_start = m_start;
@@ -944,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
 			continue;
 
+		/* if we want mirror memory skip non-mirror memory regions */
+		if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+			continue;
+
 		if (!type_b) {
 			if (out_start)
 				*out_start = m_start;
@@ -1096,8 +1138,18 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-	return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
-				       nid, MEMBLOCK_NONE);
+	ulong flags = choose_memblock_flags();
+	phys_addr_t ret;
+
+again:
+	ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+				      nid, flags);
+
+	if (!ret && (flags & MEMBLOCK_MIRROR)) {
+		flags &= ~MEMBLOCK_MIRROR;
+		goto again;
+	}
+	return ret;
 }
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1167,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal(
 {
 	phys_addr_t alloc;
 	void *ptr;
+	ulong flags = choose_memblock_flags();
 
 	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
 		nid = NUMA_NO_NODE;
@@ -1187,14 +1240,14 @@ static void * __init memblock_virt_alloc_internal(
 
 again:
 	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
-					    nid, MEMBLOCK_NONE);
+					    nid, flags);
 	if (alloc)
 		goto done;
 
 	if (nid != NUMA_NO_NODE) {
 		alloc = memblock_find_in_range_node(size, align, min_addr,
 						    max_addr, NUMA_NO_NODE,
-						    MEMBLOCK_NONE);
+						    flags);
 		if (alloc)
 			goto done;
 	}
@@ -1202,10 +1255,16 @@ again:
 	if (min_addr) {
 		min_addr = 0;
 		goto again;
-	} else {
-		goto error;
 	}
 
+	if (flags & MEMBLOCK_MIRROR) {
+		flags &= ~MEMBLOCK_MIRROR;
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		goto again;
+	}
+
+	return NULL;
 done:
 	memblock_reserve(alloc, size);
 	ptr = phys_to_virt(alloc);
@@ -1220,9 +1279,6 @@ done:
 	kmemleak_alloc(ptr, size, 0, 0);
 
 	return ptr;
-
-error:
-	return NULL;
 }
 
 /**
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index ad3641dcdbe7..5258386fa1be 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,12 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 {
 	void *ptr;
 	u64 addr;
+	ulong flags = choose_memblock_flags();
 
 	if (limit > memblock.current_limit)
 		limit = memblock.current_limit;
 
+again:
 	addr = memblock_find_in_range_node(size, align, goal, limit, nid,
-					   MEMBLOCK_NONE);
+					   flags);
+	if (!addr && (flags & MEMBLOCK_MIRROR)) {
+		flags &= ~MEMBLOCK_MIRROR;
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		goto again;
+	}
 	if (!addr)
 		return NULL;
--
cgit v1.2.3-59-g8ed1b
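
The pattern this second patch applies everywhere (ask for mirrored memory
first, then warn and retry without the MEMBLOCK_MIRROR bit so boot can
proceed) can be modelled in a few lines.  The following is a minimal
user-space sketch; alloc_from(), alloc_boot_data(), and the fake addresses
are assumptions made for the illustration, not kernel interfaces.

/*
 * Minimal user-space sketch (NOT kernel code) of the mirror-first,
 * fall-back-on-exhaustion allocation pattern from this patch.
 */
#include <stdio.h>
#include <stdint.h>

#define REGION_NONE   0x0UL
#define REGION_MIRROR 0x2UL

static uint64_t mirrored_left = 0x100000;	/* pretend 1 MiB of mirror */

/* Stand-in allocator: mirrored requests fail once the mirror is used up. */
static uint64_t alloc_from(uint64_t size, unsigned long flags)
{
	if (flags & REGION_MIRROR) {
		if (size > mirrored_left)
			return 0;		/* mirror exhausted */
		mirrored_left -= size;
		return 0x00100000;		/* fake mirrored address */
	}
	return 0x01000000;			/* fake ordinary address */
}

static uint64_t alloc_boot_data(uint64_t size)
{
	unsigned long flags = REGION_MIRROR;	/* cf. choose_memblock_flags() */
	uint64_t ret;

again:
	ret = alloc_from(size, flags);
	if (!ret && (flags & REGION_MIRROR)) {
		/* same shape as the kernel's pr_warn() followed by retry */
		fprintf(stderr,
			"Could not allocate %llu bytes of mirrored memory\n",
			(unsigned long long)size);
		flags &= ~REGION_MIRROR;
		goto again;
	}
	return ret;
}

int main(void)
{
	/* third call exhausts the mirror and falls back with a warning */
	printf("first:  %#llx\n", (unsigned long long)alloc_boot_data(0x80000));
	printf("second: %#llx\n", (unsigned long long)alloc_boot_data(0x80000));
	printf("third:  %#llx\n", (unsigned long long)alloc_boot_data(0x80000));
	return 0;
}

The design point visible in both the sketch and the patch is that mirror
exhaustion degrades to a warning rather than an allocation failure: RAS
coverage is traded away incrementally so the system is still guaranteed to
boot.
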