From 2bd926b439b4cb6b9ed240a9781cd01958b53d85 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 28 Dec 2018 00:29:53 -0800 Subject: kasan: add CONFIG_KASAN_GENERIC and CONFIG_KASAN_SW_TAGS This commit splits the current CONFIG_KASAN config option into two: 1. CONFIG_KASAN_GENERIC, that enables the generic KASAN mode (the one that exists now); 2. CONFIG_KASAN_SW_TAGS, that enables the software tag-based KASAN mode. The name CONFIG_KASAN_SW_TAGS is chosen as in the future we will have another hardware tag-based KASAN mode, that will rely on hardware memory tagging support in arm64. With CONFIG_KASAN_SW_TAGS enabled, compiler options are changed to instrument kernel files with -fsantize=kernel-hwaddress (except the ones for which KASAN_SANITIZE := n is set). Both CONFIG_KASAN_GENERIC and CONFIG_KASAN_SW_TAGS support both CONFIG_KASAN_INLINE and CONFIG_KASAN_OUTLINE instrumentation modes. This commit also adds empty placeholder (for now) implementation of tag-based KASAN specific hooks inserted by the compiler and adjusts common hooks implementation. While this commit adds the CONFIG_KASAN_SW_TAGS config option, this option is not selectable, as it depends on HAVE_ARCH_KASAN_SW_TAGS, which we will enable once all the infrastracture code has been added. Link: http://lkml.kernel.org/r/b2550106eb8a68b10fefbabce820910b115aa853.1544099024.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Andrey Ryabinin Reviewed-by: Dmitry Vyukov Cc: Christoph Lameter Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 98 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index d0bad1bd9a2b..d8c474b6691e 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -1,36 +1,92 @@ +# This config refers to the generic KASAN mode. config HAVE_ARCH_KASAN bool -if HAVE_ARCH_KASAN +config HAVE_ARCH_KASAN_SW_TAGS + bool + +config CC_HAS_KASAN_GENERIC + def_bool $(cc-option, -fsanitize=kernel-address) + +config CC_HAS_KASAN_SW_TAGS + def_bool $(cc-option, -fsanitize=kernel-hwaddress) config KASAN - bool "KASan: runtime memory debugger" + bool "KASAN: runtime memory debugger" + depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \ + (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS) + depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) + help + Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, + designed to find out-of-bounds accesses and use-after-free bugs. + See Documentation/dev-tools/kasan.rst for details. + +choice + prompt "KASAN mode" + depends on KASAN + default KASAN_GENERIC + help + KASAN has two modes: generic KASAN (similar to userspace ASan, + x86_64/arm64/xtensa, enabled with CONFIG_KASAN_GENERIC) and + software tag-based KASAN (a version based on software memory + tagging, arm64 only, similar to userspace HWASan, enabled with + CONFIG_KASAN_SW_TAGS). + Both generic and tag-based KASAN are strictly debugging features. + +config KASAN_GENERIC + bool "Generic mode" + depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) select SLUB_DEBUG if SLUB select CONSTRUCTORS select STACKDEPOT help - Enables kernel address sanitizer - runtime memory debugger, - designed to find out-of-bounds accesses and use-after-free bugs. - This is strictly a debugging feature and it requires a gcc version - of 4.9.2 or later. 
Detection of out of bounds accesses to stack or - global variables requires gcc 5.0 or later. - This feature consumes about 1/8 of available memory and brings about - ~x3 performance slowdown. + Enables generic KASAN mode. + Supported in both GCC and Clang. With GCC it requires version 4.9.2 + or later for basic support and version 5.0 or later for detection of + out-of-bounds accesses for stack and global variables and for inline + instrumentation mode (CONFIG_KASAN_INLINE). With Clang it requires + version 3.7.0 or later and it doesn't support detection of + out-of-bounds accesses for global variables yet. + This mode consumes about 1/8th of available memory at kernel start + and introduces an overhead of ~x1.5 for the rest of the allocations. + The performance slowdown is ~x3. For better error detection enable CONFIG_STACKTRACE. - Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB + Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB (the resulting kernel does not boot). +config KASAN_SW_TAGS + bool "Software tag-based mode" + depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS + depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) + select SLUB_DEBUG if SLUB + select CONSTRUCTORS + select STACKDEPOT + help + Enables software tag-based KASAN mode. + This mode requires Top Byte Ignore support by the CPU and therefore + is only supported for arm64. + This mode requires Clang version 7.0.0 or later. + This mode consumes about 1/16th of available memory at kernel start + and introduces an overhead of ~20% for the rest of the allocations. + This mode may potentially introduce problems relating to pointer + casting and comparison, as it embeds tags into the top byte of each + pointer. + For better error detection enable CONFIG_STACKTRACE. + Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB + (the resulting kernel does not boot). + +endchoice + config KASAN_EXTRA - bool "KAsan: extra checks" - depends on KASAN && DEBUG_KERNEL && !COMPILE_TEST + bool "KASAN: extra checks" + depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST help - This enables further checks in the kernel address sanitizer, for now - it only includes the address-use-after-scope check that can lead - to excessive kernel stack usage, frame size warnings and longer + This enables further checks in generic KASAN, for now it only + includes the address-use-after-scope check that can lead to + excessive kernel stack usage, frame size warnings and longer compile time. - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 has more - + See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 choice prompt "Instrumentation type" @@ -53,7 +109,7 @@ config KASAN_INLINE memory accesses. This is faster than outline (in some workloads it gives about x2 boost over outline instrumentation), but make kernel's .text size much bigger. - This requires a gcc version of 5.0 or later. + For CONFIG_KASAN_GENERIC this requires GCC 5.0 or later. endchoice @@ -67,11 +123,9 @@ config KASAN_S390_4_LEVEL_PAGING 4-level paging instead. config TEST_KASAN - tristate "Module for testing kasan for bug detection" + tristate "Module for testing KASAN for bug detection" depends on m && KASAN help This is a test module doing various nasty things like out of bounds accesses, use after free. It is useful for testing - kernel debugging features like kernel address sanitizer. - -endif + kernel debugging features like KASAN. 
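To make the pointer-tagging caveat in the KASAN_SW_TAGS help text above more concrete, here is a small self-contained userspace sketch -- not kernel code -- of what it means to embed a tag in the top byte of a pointer (bits 63:56, the byte arm64's Top Byte Ignore lets software repurpose). The set_tag()/reset_tag() helpers are hypothetical stand-ins for the arch-specific macros the real mode uses.

/*
 * Illustrative only: two tagged pointers to the same object differ in
 * their top byte, so a naive equality check on the raw pointer values
 * fails even though both untagged addresses match. This is the class of
 * pointer-casting/comparison problem the help text warns about.
 */
#include <stdint.h>
#include <stdio.h>

#define TAG_SHIFT	56
#define TAG_MASK	(0xffULL << TAG_SHIFT)

static void *set_tag(void *addr, uint8_t tag)		/* hypothetical helper */
{
	uint64_t p = (uint64_t)(uintptr_t)addr & ~TAG_MASK;

	return (void *)(uintptr_t)(p | ((uint64_t)tag << TAG_SHIFT));
}

static void *reset_tag(void *addr)			/* hypothetical helper */
{
	return (void *)(uintptr_t)((uint64_t)(uintptr_t)addr & ~TAG_MASK);
}

int main(void)
{
	long object;
	void *a = set_tag(&object, 0xab);
	void *b = set_tag(&object, 0xcd);

	printf("tagged pointers equal:   %d\n", a == b);
	printf("untagged pointers equal: %d\n", reset_tag(a) == reset_tag(b));
	return 0;
}

The tagged pointers are never dereferenced here; the point is only that code which compares or casts raw pointer bits must strip the tag first, which is why the help text flags casting and comparison as potential trouble spots.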
-- cgit v1.2.3-59-g8ed1b From a9ee3a63dbfff5237bc682b88c02d91a3c798e35 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 28 Dec 2018 00:32:32 -0800 Subject: debugobjects: call debug_objects_mem_init eariler The current value of the early boot static pool size, 1024 is not big enough for systems with large number of CPUs with timer or/and workqueue objects selected. As the results, systems have 60+ CPUs with both timer and workqueue objects enabled could trigger "ODEBUG: Out of memory. ODEBUG disabled". Some debug objects are allocated during the early boot. Enabling some options like timers or workqueue objects may increase the size required significantly with large number of CPUs. For example, CONFIG_DEBUG_OBJECTS_TIMERS: No. CPUs x 2 (worker pool) objects: start_kernel workqueue_init_early init_worker_pool init_timer_key debug_object_init plus No. CPUs objects (CONFIG_HIGH_RES_TIMERS): sched_init hrtick_rq_init hrtimer_init CONFIG_DEBUG_OBJECTS_WORK: No. CPUs objects: vmalloc_init __init_work plus No. CPUs x 6 (workqueue) objects: workqueue_init_early alloc_workqueue __alloc_workqueue_key alloc_and_link_pwqs init_pwq Also, plus No. CPUs objects: perf_event_init __init_srcu_struct init_srcu_struct_fields init_srcu_struct_nodes __init_work However, none of the things are actually used or required before debug_objects_mem_init() is invoked, so just move the call right before vmalloc_init(). According to tglx, "the reason why the call is at this place in start_kernel() is historical. It's because back in the days when debugobjects were added the memory allocator was enabled way later than today." Link: http://lkml.kernel.org/r/20181126102407.1836-1-cai@gmx.us Signed-off-by: Qian Cai Suggested-by: Thomas Gleixner Cc: Waiman Long Cc: Yang Shi Cc: Arnd Bergmann Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- lib/debugobjects.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/init/main.c b/init/main.c index 954d9b6c62c6..f6e901ec6b78 100644 --- a/init/main.c +++ b/init/main.c @@ -521,6 +521,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); pgtable_init(); + debug_objects_mem_init(); vmalloc_init(); ioremap_huge_init(); /* Should be run before the first non-init thread is created */ @@ -697,7 +698,6 @@ asmlinkage __visible void __init start_kernel(void) #endif page_ext_init(); kmemleak_init(); - debug_objects_mem_init(); setup_per_cpu_pageset(); numa_policy_init(); acpi_early_init(); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 14afeeb7d6ef..55437fd5128b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1131,11 +1131,10 @@ static int __init debug_objects_replace_static_objects(void) } /* - * When debug_objects_mem_init() is called we know that only - * one CPU is up, so disabling interrupts is enough - * protection. This avoids the lockdep hell of lock ordering. + * debug_objects_mem_init() is now called early that only one CPU is up + * and interrupts have been disabled, so it is safe to replace the + * active object references. 
*/ - local_irq_disable(); /* Remove the statically allocated objects from the pool */ hlist_for_each_entry_safe(obj, tmp, &obj_pool, node) @@ -1156,7 +1155,6 @@ static int __init debug_objects_replace_static_objects(void) cnt++; } } - local_irq_enable(); pr_debug("%d of %d active objects replaced\n", cnt, obj_pool_used); -- cgit v1.2.3-59-g8ed1b From 9705bea5f833f4fc21d5bef5fce7348427f76ea4 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Fri, 28 Dec 2018 00:34:24 -0800 Subject: mm: convert zone->managed_pages to atomic variable totalram_pages, zone->managed_pages and totalhigh_pages updates are protected by managed_page_count_lock, but readers never care about it. Convert these variables to atomic to avoid readers potentially seeing a store tear. This patch converts zone->managed_pages. Subsequent patches will convert totalram_panges, totalhigh_pages and eventually managed_page_count_lock will be removed. Main motivation was that managed_page_count_lock handling was complicating things. It was discussed in length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 So it seemes better to remove the lock and convert variables to atomic, with preventing poteintial store-to-read tearing as a bonus. Link: http://lkml.kernel.org/r/1542090790-21750-3-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Suggested-by: Michal Hocko Suggested-by: Vlastimil Babka Reviewed-by: Konstantin Khlebnikov Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +- include/linux/mmzone.h | 9 +++++-- lib/show_mem.c | 2 +- mm/memblock.c | 2 +- mm/page_alloc.c | 44 +++++++++++++++++------------------ mm/vmstat.c | 4 ++-- 6 files changed, 34 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index c02adbbeef2a..b7bc7d7d048f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -853,7 +853,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, */ pgdat = NODE_DATA(numa_node_id); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]); mem_in_bytes <<= PAGE_SHIFT; sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 077d797d1f60..a23e34e21178 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,7 +435,7 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages. 
*/ - unsigned long managed_pages; + atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; @@ -524,6 +524,11 @@ enum pgdat_flags { PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; +static inline unsigned long zone_managed_pages(struct zone *zone) +{ + return (unsigned long)atomic_long_read(&zone->managed_pages); +} + static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; @@ -820,7 +825,7 @@ static inline bool is_dev_zone(const struct zone *zone) */ static inline bool managed_zone(struct zone *zone) { - return zone->managed_pages; + return zone_managed_pages(zone); } /* Returns true if a zone has memory */ diff --git a/lib/show_mem.c b/lib/show_mem.c index 0beaa1d899aa..eefe67d50e84 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -28,7 +28,7 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) continue; total += zone->present_pages; - reserved += zone->present_pages - zone->managed_pages; + reserved += zone->present_pages - zone_managed_pages(zone); if (is_highmem_idx(zoneid)) highmem += zone->present_pages; diff --git a/mm/memblock.c b/mm/memblock.c index 81ae63ca78d0..0068f87af1e8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1950,7 +1950,7 @@ void reset_node_managed_pages(pg_data_t *pgdat) struct zone *z; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; + atomic_long_set(&z->managed_pages, 0); } void __init reset_all_zones_managed_pages(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b79e79caea99..4b5c4ff68f18 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1280,7 +1280,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) __ClearPageReserved(p); set_page_count(p, 0); - page_zone(page)->managed_pages += nr_pages; + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); set_page_refcounted(page); __free_pages(page, order); } @@ -2259,7 +2259,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, * Limit the number reserved to 1 pageblock or roughly 1% of a zone. * Check is race-prone but harmless. 
*/ - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) return; @@ -4661,7 +4661,7 @@ static unsigned long nr_free_zone_pages(int offset) struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->managed_pages; + unsigned long size = zone_managed_pages(zone); unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -4768,7 +4768,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += pgdat->node_zones[zone_type].managed_pages; + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); @@ -4777,7 +4777,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { - managed_highpages += zone->managed_pages; + managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } @@ -4984,7 +4984,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), - K(zone->managed_pages), + K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), @@ -5656,7 +5656,7 @@ static int zone_batchsize(struct zone *zone) * The per-cpu-pages pools are set to around 1000th of the * size of the zone. */ - batch = zone->managed_pages / 1024; + batch = zone_managed_pages(zone) / 1024; /* But no more than a meg. 
*/ if (batch * PAGE_SIZE > 1024 * 1024) batch = (1024 * 1024) / PAGE_SIZE; @@ -5766,7 +5766,7 @@ static void pageset_set_high_and_batch(struct zone *zone, { if (percpu_pagelist_fraction) pageset_set_high(pcp, - (zone->managed_pages / + (zone_managed_pages(zone) / percpu_pagelist_fraction)); else pageset_set_batch(pcp, zone_batchsize(zone)); @@ -6323,7 +6323,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, unsigned long remaining_pages) { - zone->managed_pages = remaining_pages; + atomic_long_set(&zone->managed_pages, remaining_pages); zone_set_nid(zone, nid); zone->name = zone_names[idx]; zone->zone_pgdat = NODE_DATA(nid); @@ -7076,7 +7076,7 @@ early_param("movablecore", cmdline_parse_movablecore); void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); - page_zone(page)->managed_pages += count; + atomic_long_add(count, &page_zone(page)->managed_pages); totalram_pages += count; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -7124,7 +7124,7 @@ void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; - page_zone(page)->managed_pages++; + atomic_long_inc(&page_zone(page)->managed_pages); totalhigh_pages++; } #endif @@ -7257,7 +7257,7 @@ static void calculate_totalreserve_pages(void) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7293,7 +7293,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); zone->lowmem_reserve[j] = 0; @@ -7311,7 +7311,7 @@ static void setup_per_zone_lowmem_reserve(void) lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; } - managed_pages += lower_zone->managed_pages; + managed_pages += zone_managed_pages(lower_zone); } } } @@ -7330,14 +7330,14 @@ static void __setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->managed_pages; + lowmem_pages += zone_managed_pages(zone); } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; + tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -7351,7 +7351,7 @@ static void __setup_per_zone_wmarks(void) */ unsigned long min_pages; - min_pages = zone->managed_pages / 1024; + min_pages = zone_managed_pages(zone) / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { @@ -7368,7 +7368,7 @@ static void __setup_per_zone_wmarks(void) * ensure a minimum size on small systems. 
*/ tmp = max_t(u64, tmp >> 2, - mult_frac(zone->managed_pages, + mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; @@ -7498,8 +7498,8 @@ static void setup_min_unmapped_ratio(void) pgdat->min_unmapped_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * + sysctl_min_unmapped_ratio) / 100; } @@ -7526,8 +7526,8 @@ static void setup_min_slab_ratio(void) pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, diff --git a/mm/vmstat.c b/mm/vmstat.c index 9c624595e904..83b30edc2f7f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone) * 125 1024 10 16-32 GB 9 */ - mem = zone->managed_pages >> (27 - PAGE_SHIFT); + mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); @@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone->managed_pages); + zone_managed_pages(zone)); seq_printf(m, "\n protection: (%ld", -- cgit v1.2.3-59-g8ed1b From c3a5c77afefa697bf87f15272c7257e1574cad56 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 28 Dec 2018 00:37:16 -0800 Subject: lib/show_mem.c: drop pgdat_resize_lock in show_mem() Function show_mem() is used to print system memory status when user requires or fail to allocate memory. Generally, this is a best effort information so any races with memory hotplug (or very theoretically an early initialization) should be tolerable and the worst that could happen is to print an imprecise node state. Drop the resize lock because this is the only place which might hold the lock from the interrupt context and so all other callers might use a simple spinlock. Even though this doesn't solve any real issue it makes the code easier to follow and tiny more effective. 
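This lock drop and the managed_pages conversion above lean on the same pattern: a diagnostic dump such as show_mem() takes no lock, but every counter it reads goes through an accessor over an atomic, so the worst it can see is a slightly stale value, never a torn one. A rough userspace sketch of that pattern, with C11 atomics standing in for the kernel's atomic_long_t (the struct and initial values below are illustrative, though the accessor names match the patch):

#include <stdatomic.h>
#include <stdio.h>

struct zone {
	atomic_long	managed_pages;	/* previously a plain unsigned long under a lock */
	unsigned long	present_pages;
};

/* Getter readers use instead of touching the field directly. */
static unsigned long zone_managed_pages(struct zone *zone)
{
	return (unsigned long)atomic_load(&zone->managed_pages);
}

/* Writers stay serialized elsewhere; the atomic only prevents store tearing. */
static void adjust_managed_page_count(struct zone *zone, long count)
{
	atomic_fetch_add(&zone->managed_pages, count);
}

int main(void)
{
	struct zone z = { .present_pages = 262144 };

	atomic_store(&z.managed_pages, 245760);
	adjust_managed_page_count(&z, -512);

	/* What a lockless, best-effort dump like show_mem() would report. */
	printf("present %lu, managed %lu, reserved %lu\n",
	       z.present_pages, zone_managed_pages(&z),
	       z.present_pages - zone_managed_pages(&z));
	return 0;
}

Dropping pgdat_resize_lock() in show_mem() is the same trade: stale-but-consistent output is acceptable for a best-effort report, so holding the lock there buys nothing.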
Link: http://lkml.kernel.org/r/20181129235532.9328-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index eefe67d50e84..6a042f53e7bb 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,10 +18,8 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { - unsigned long flags; int zoneid; - pgdat_resize_lock(pgdat, &flags); for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { struct zone *zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) @@ -33,7 +31,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) if (is_highmem_idx(zoneid)) highmem += zone->present_pages; } - pgdat_resize_unlock(pgdat, &flags); } printk("%lu pages RAM\n", total); -- cgit v1.2.3-59-g8ed1b From d239865ac804c91a621294ca7bece4241b006fae Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 28 Dec 2018 00:37:38 -0800 Subject: ioremap: rework pXd_free_pYd_page() API The recently merged API for ensuring break-before-make on page-table entries when installing huge mappings in the vmalloc/ioremap region is fairly counter-intuitive, resulting in the arch freeing functions (e.g. pmd_free_pte_page()) being called even on entries that aren't present. This resulted in a minor bug in the arm64 implementation, giving rise to spurious VM_WARN messages. This patch moves the pXd_present() checks out into the core code, refactoring the callsites at the same time so that we avoid the complex conjunctions when determining whether or not we can put down a huge mapping. Link: http://lkml.kernel.org/r/1544120495-17438-2-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Reviewed-by: Toshi Kani Suggested-by: Linus Torvalds Cc: Chintan Pandya Cc: Toshi Kani Cc: Thomas Gleixner Cc: Michal Hocko Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Sean Christopherson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ioremap.c | 56 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/ioremap.c b/lib/ioremap.c index 517f5853ffed..6c72764af19c 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -76,6 +76,25 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } +static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + if (!ioremap_pmd_enabled()) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -89,13 +108,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); - if (ioremap_pmd_enabled() && - ((next - addr) == PMD_SIZE) && - IS_ALIGNED(phys_addr + addr, PMD_SIZE) && - pmd_free_pte_page(pmd, addr)) { - if (pmd_set_huge(pmd, phys_addr + addr, prot)) - continue; - } + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot)) + continue; if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) return -ENOMEM; @@ -103,6 +117,25 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, return 0; } +static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + if (!ioremap_pud_enabled()) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -116,13 +149,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); - if (ioremap_pud_enabled() && - ((next - addr) == PUD_SIZE) && - IS_ALIGNED(phys_addr + addr, PUD_SIZE) && - pud_free_pmd_page(pud, addr)) { - if (pud_set_huge(pud, phys_addr + addr, prot)) - continue; - } + if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot)) + continue; if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 36ddc5a78c878e9b10c323d2fe88b0dae2f487eb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 28 Dec 2018 00:37:49 -0800 Subject: lib/ioremap: ensure phys_addr actually corresponds to a physical address The current ioremap() code uses a phys_addr variable at each level of page table, which is confusingly offset by subtracting the base virtual address being mapped so that adding the current virtual address back on when iterating through the page table entries gives back the corresponding physical address. This is fairly confusing and results in all users of phys_addr having to add the current virtual address back on. Instead, this patch just updates phys_addr when iterating over the page table entries, ensuring that it's always up-to-date and doesn't require explicit offsetting. 
Link: http://lkml.kernel.org/r/1544120495-17438-5-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Tested-by: Sean Christopherson Reviewed-by: Sean Christopherson Cc: Chintan Pandya Cc: Toshi Kani Cc: Thomas Gleixner Cc: Michal Hocko Cc: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ioremap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/ioremap.c b/lib/ioremap.c index 6c72764af19c..10d7c5485c39 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -101,19 +101,18 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, pmd_t *pmd; unsigned long next; - phys_addr -= addr; pmd = pmd_alloc(&init_mm, pud, addr); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot)) + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) continue; - if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) + if (ioremap_pte_range(pmd, addr, next, phys_addr, prot)) return -ENOMEM; - } while (pmd++, addr = next, addr != end); + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } @@ -142,19 +141,18 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, pud_t *pud; unsigned long next; - phys_addr -= addr; pud = pud_alloc(&init_mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot)) + if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) continue; - if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) + if (ioremap_pmd_range(pud, addr, next, phys_addr, prot)) return -ENOMEM; - } while (pud++, addr = next, addr != end); + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } @@ -164,7 +162,6 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, p4d_t *p4d; unsigned long next; - phys_addr -= addr; p4d = p4d_alloc(&init_mm, pgd, addr); if (!p4d) return -ENOMEM; @@ -173,14 +170,14 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, if (ioremap_p4d_enabled() && ((next - addr) == P4D_SIZE) && - IS_ALIGNED(phys_addr + addr, P4D_SIZE)) { - if (p4d_set_huge(p4d, phys_addr + addr, prot)) + IS_ALIGNED(phys_addr, P4D_SIZE)) { + if (p4d_set_huge(p4d, phys_addr, prot)) continue; } - if (ioremap_pud_range(p4d, addr, next, phys_addr + addr, prot)) + if (ioremap_pud_range(p4d, addr, next, phys_addr, prot)) return -ENOMEM; - } while (p4d++, addr = next, addr != end); + } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } @@ -196,14 +193,13 @@ int ioremap_page_range(unsigned long addr, BUG_ON(addr >= end); start = addr; - phys_addr -= addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr+addr, prot); + err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot); if (err) break; - } while (pgd++, addr = next, addr != end); + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); flush_cache_vmap(start, end); -- cgit v1.2.3-59-g8ed1b From 8e2d43405b22e98cf5f3730c1829ec1fdbe17ae7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 28 Dec 2018 00:37:53 -0800 Subject: lib/ioremap: ensure break-before-make is used for huge p4d mappings Whilst no architectures actually enable support for huge p4d mappings in the vmap area, the code that is implemented 
should be using break-before-make, as we do for pud and pmd huge entries. Link: http://lkml.kernel.org/r/1544120495-17438-6-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Reviewed-by: Toshi Kani Cc: Chintan Pandya Cc: Toshi Kani Cc: Thomas Gleixner Cc: Michal Hocko Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Sean Christopherson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 5 +++++ arch/x86/mm/pgtable.c | 8 ++++++++ include/asm-generic/pgtable.h | 5 +++++ lib/ioremap.c | 27 +++++++++++++++++++++------ 4 files changed, 39 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 13b80361d9f5..b6f5aa52ac67 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1043,6 +1043,11 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) return 1; } +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; /* Don't attempt a block mapping */ +} + #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e95a7d6ac8f8..b0284eab14dc 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -794,6 +794,14 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } +/* + * Until we support 512GB pages, skip them in the vmap area. + */ +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} + #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a9cac82e9a7a..05e61e6c843f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1057,6 +1057,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ @@ -1084,6 +1085,10 @@ static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } +static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; diff --git a/lib/ioremap.c b/lib/ioremap.c index 10d7c5485c39..063213685563 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -156,6 +156,25 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, return 0; } +static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + if (!ioremap_p4d_enabled()) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -168,12 +187,8 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); - if (ioremap_p4d_enabled() && - ((next - addr) == P4D_SIZE) && - IS_ALIGNED(phys_addr, P4D_SIZE)) { - if (p4d_set_huge(p4d, phys_addr, prot)) - continue; - } + if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + continue; if (ioremap_pud_range(p4d, addr, next, 
phys_addr, prot)) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From d53ce042277a94eadf9a8a31fc41fac54c67dec5 Mon Sep 17 00:00:00 2001 From: Sri Krishna chowdary Date: Fri, 28 Dec 2018 00:38:54 -0800 Subject: kmemleak: add config to select auto scan Kmemleak scan can be cpu intensive and can stall user tasks at times. To prevent this, add config DEBUG_KMEMLEAK_AUTO_SCAN to enable/disable auto scan on boot up. Also protect first_run with DEBUG_KMEMLEAK_AUTO_SCAN as this is meant for only first automatic scan. Link: http://lkml.kernel.org/r/1540231723-7087-1-git-send-email-prpatel@nvidia.com Signed-off-by: Sri Krishna chowdary Signed-off-by: Sachin Nikam Signed-off-by: Prateek Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 15 +++++++++++++++ mm/kmemleak.c | 10 ++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b3c91b9e32f8..2b5a4256e88b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -593,6 +593,21 @@ config DEBUG_KMEMLEAK_DEFAULT_OFF Say Y here to disable kmemleak by default. It can then be enabled on the command line via kmemleak=on. +config DEBUG_KMEMLEAK_AUTO_SCAN + bool "Enable kmemleak auto scan thread on boot up" + default y + depends on DEBUG_KMEMLEAK + help + Depending on the cpu, kmemleak scan may be cpu intensive and can + stall user tasks at times. This option enables/disables automatic + kmemleak scan at boot up. + + Say N here to disable kmemleak auto scan thread to stop automatic + scanning. Disabling this option disables automatic reporting of + memory leaks. + + If unsure, say Y. + config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL && !IA64 diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 9c3d2dea0861..f9d9dc250428 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1650,7 +1650,7 @@ static void kmemleak_scan(void) */ static int kmemleak_scan_thread(void *arg) { - static int first_run = 1; + static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN); pr_info("Automatic memory scanning thread started\n"); set_user_nice(current, 10); @@ -2144,9 +2144,11 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - mutex_lock(&scan_mutex); - start_scan_thread(); - mutex_unlock(&scan_mutex); + if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) { + mutex_lock(&scan_mutex); + start_scan_thread(); + mutex_unlock(&scan_mutex); + } pr_info("Kernel memory leak detector initialized\n"); -- cgit v1.2.3-59-g8ed1b
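The implementation detail worth noting in the patch above is that first_run is initialized from IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN), i.e. a Kconfig symbol evaluated as an ordinary C expression. Below is a stripped-down userspace reconstruction of that preprocessor trick, intended only to show how a defined/undefined config macro turns into a compile-time 1 or 0; the real macro lives in include/linux/kconfig.h and additionally handles =m tristate symbols.

#include <stdio.h>

#define __ARG_PLACEHOLDER_1		0,
#define __take_second_arg(__ignored, val, ...)	val
#define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x)			___is_defined(x)
#define IS_ENABLED(option)		__is_defined(option)	/* simplified: ignores =m */

#define CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN	1	/* comment this out to get 0 */

int main(void)
{
	/* Mirrors "static int first_run = IS_ENABLED(...)" in kmemleak_scan_thread(). */
	static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);

	printf("auto scan on boot: %d\n", first_run);
	return 0;
}

When the symbol is defined to 1, the token paste produces __ARG_PLACEHOLDER_1, which expands to "0," and shifts the literal 1 into the second-argument slot; when the symbol is undefined, no such expansion happens and the 0 stays in that slot. Because the result is an integer constant expression, it can initialize a static variable and lets the compiler discard the disabled branch, which is why the patch can drop start_scan_thread() entirely when auto scan is configured off.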