From 2bd926b439b4cb6b9ed240a9781cd01958b53d85 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 28 Dec 2018 00:29:53 -0800 Subject: kasan: add CONFIG_KASAN_GENERIC and CONFIG_KASAN_SW_TAGS This commit splits the current CONFIG_KASAN config option into two: 1. CONFIG_KASAN_GENERIC, that enables the generic KASAN mode (the one that exists now); 2. CONFIG_KASAN_SW_TAGS, that enables the software tag-based KASAN mode. The name CONFIG_KASAN_SW_TAGS is chosen as in the future we will have another hardware tag-based KASAN mode, that will rely on hardware memory tagging support in arm64. With CONFIG_KASAN_SW_TAGS enabled, compiler options are changed to instrument kernel files with -fsantize=kernel-hwaddress (except the ones for which KASAN_SANITIZE := n is set). Both CONFIG_KASAN_GENERIC and CONFIG_KASAN_SW_TAGS support both CONFIG_KASAN_INLINE and CONFIG_KASAN_OUTLINE instrumentation modes. This commit also adds empty placeholder (for now) implementation of tag-based KASAN specific hooks inserted by the compiler and adjusts common hooks implementation. While this commit adds the CONFIG_KASAN_SW_TAGS config option, this option is not selectable, as it depends on HAVE_ARCH_KASAN_SW_TAGS, which we will enable once all the infrastracture code has been added. Link: http://lkml.kernel.org/r/b2550106eb8a68b10fefbabce820910b115aa853.1544099024.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Andrey Ryabinin Reviewed-by: Dmitry Vyukov Cc: Christoph Lameter Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 98 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index d0bad1bd9a2b..d8c474b6691e 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -1,36 +1,92 @@ +# This config refers to the generic KASAN mode. config HAVE_ARCH_KASAN bool -if HAVE_ARCH_KASAN +config HAVE_ARCH_KASAN_SW_TAGS + bool + +config CC_HAS_KASAN_GENERIC + def_bool $(cc-option, -fsanitize=kernel-address) + +config CC_HAS_KASAN_SW_TAGS + def_bool $(cc-option, -fsanitize=kernel-hwaddress) config KASAN - bool "KASan: runtime memory debugger" + bool "KASAN: runtime memory debugger" + depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \ + (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS) + depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) + help + Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, + designed to find out-of-bounds accesses and use-after-free bugs. + See Documentation/dev-tools/kasan.rst for details. + +choice + prompt "KASAN mode" + depends on KASAN + default KASAN_GENERIC + help + KASAN has two modes: generic KASAN (similar to userspace ASan, + x86_64/arm64/xtensa, enabled with CONFIG_KASAN_GENERIC) and + software tag-based KASAN (a version based on software memory + tagging, arm64 only, similar to userspace HWASan, enabled with + CONFIG_KASAN_SW_TAGS). + Both generic and tag-based KASAN are strictly debugging features. + +config KASAN_GENERIC + bool "Generic mode" + depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) select SLUB_DEBUG if SLUB select CONSTRUCTORS select STACKDEPOT help - Enables kernel address sanitizer - runtime memory debugger, - designed to find out-of-bounds accesses and use-after-free bugs. - This is strictly a debugging feature and it requires a gcc version - of 4.9.2 or later. 
Detection of out of bounds accesses to stack or - global variables requires gcc 5.0 or later. - This feature consumes about 1/8 of available memory and brings about - ~x3 performance slowdown. + Enables generic KASAN mode. + Supported in both GCC and Clang. With GCC it requires version 4.9.2 + or later for basic support and version 5.0 or later for detection of + out-of-bounds accesses for stack and global variables and for inline + instrumentation mode (CONFIG_KASAN_INLINE). With Clang it requires + version 3.7.0 or later and it doesn't support detection of + out-of-bounds accesses for global variables yet. + This mode consumes about 1/8th of available memory at kernel start + and introduces an overhead of ~x1.5 for the rest of the allocations. + The performance slowdown is ~x3. For better error detection enable CONFIG_STACKTRACE. - Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB + Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB (the resulting kernel does not boot). +config KASAN_SW_TAGS + bool "Software tag-based mode" + depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS + depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) + select SLUB_DEBUG if SLUB + select CONSTRUCTORS + select STACKDEPOT + help + Enables software tag-based KASAN mode. + This mode requires Top Byte Ignore support by the CPU and therefore + is only supported for arm64. + This mode requires Clang version 7.0.0 or later. + This mode consumes about 1/16th of available memory at kernel start + and introduces an overhead of ~20% for the rest of the allocations. + This mode may potentially introduce problems relating to pointer + casting and comparison, as it embeds tags into the top byte of each + pointer. + For better error detection enable CONFIG_STACKTRACE. + Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB + (the resulting kernel does not boot). + +endchoice + config KASAN_EXTRA - bool "KAsan: extra checks" - depends on KASAN && DEBUG_KERNEL && !COMPILE_TEST + bool "KASAN: extra checks" + depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST help - This enables further checks in the kernel address sanitizer, for now - it only includes the address-use-after-scope check that can lead - to excessive kernel stack usage, frame size warnings and longer + This enables further checks in generic KASAN, for now it only + includes the address-use-after-scope check that can lead to + excessive kernel stack usage, frame size warnings and longer compile time. - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 has more - + See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 choice prompt "Instrumentation type" @@ -53,7 +109,7 @@ config KASAN_INLINE memory accesses. This is faster than outline (in some workloads it gives about x2 boost over outline instrumentation), but make kernel's .text size much bigger. - This requires a gcc version of 5.0 or later. + For CONFIG_KASAN_GENERIC this requires GCC 5.0 or later. endchoice @@ -67,11 +123,9 @@ config KASAN_S390_4_LEVEL_PAGING 4-level paging instead. config TEST_KASAN - tristate "Module for testing kasan for bug detection" + tristate "Module for testing KASAN for bug detection" depends on m && KASAN help This is a test module doing various nasty things like out of bounds accesses, use after free. It is useful for testing - kernel debugging features like kernel address sanitizer. - -endif + kernel debugging features like KASAN. 
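To make the pointer-tagging caveat in the KASAN_SW_TAGS help text above more concrete, here is a small self-contained userspace sketch -- not kernel code -- of what it means to embed a tag in the top byte of a pointer (bits 63:56, the byte arm64's Top Byte Ignore lets software repurpose). The set_tag()/reset_tag() helpers are hypothetical stand-ins for the arch-specific macros the real mode uses.

/*
 * Illustrative only: two tagged pointers to the same object differ in
 * their top byte, so a naive equality check on the raw pointer values
 * fails even though both untagged addresses match. This is the class of
 * pointer-casting/comparison problem the help text warns about.
 */
#include <stdint.h>
#include <stdio.h>

#define TAG_SHIFT	56
#define TAG_MASK	(0xffULL << TAG_SHIFT)

static void *set_tag(void *addr, uint8_t tag)		/* hypothetical helper */
{
	uint64_t p = (uint64_t)(uintptr_t)addr & ~TAG_MASK;

	return (void *)(uintptr_t)(p | ((uint64_t)tag << TAG_SHIFT));
}

static void *reset_tag(void *addr)			/* hypothetical helper */
{
	return (void *)(uintptr_t)((uint64_t)(uintptr_t)addr & ~TAG_MASK);
}

int main(void)
{
	long object;
	void *a = set_tag(&object, 0xab);
	void *b = set_tag(&object, 0xcd);

	printf("tagged pointers equal:   %d\n", a == b);
	printf("untagged pointers equal: %d\n", reset_tag(a) == reset_tag(b));
	return 0;
}

The tagged pointers are never dereferenced here; the point is only that code which compares or casts raw pointer bits must strip the tag first, which is why the help text flags casting and comparison as potential trouble spots.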
-- cgit v1.2.3-59-g8ed1b From a9ee3a63dbfff5237bc682b88c02d91a3c798e35 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 28 Dec 2018 00:32:32 -0800 Subject: debugobjects: call debug_objects_mem_init eariler The current value of the early boot static pool size, 1024 is not big enough for systems with large number of CPUs with timer or/and workqueue objects selected. As the results, systems have 60+ CPUs with both timer and workqueue objects enabled could trigger "ODEBUG: Out of memory. ODEBUG disabled". Some debug objects are allocated during the early boot. Enabling some options like timers or workqueue objects may increase the size required significantly with large number of CPUs. For example, CONFIG_DEBUG_OBJECTS_TIMERS: No. CPUs x 2 (worker pool) objects: start_kernel workqueue_init_early init_worker_pool init_timer_key debug_object_init plus No. CPUs objects (CONFIG_HIGH_RES_TIMERS): sched_init hrtick_rq_init hrtimer_init CONFIG_DEBUG_OBJECTS_WORK: No. CPUs objects: vmalloc_init __init_work plus No. CPUs x 6 (workqueue) objects: workqueue_init_early alloc_workqueue __alloc_workqueue_key alloc_and_link_pwqs init_pwq Also, plus No. CPUs objects: perf_event_init __init_srcu_struct init_srcu_struct_fields init_srcu_struct_nodes __init_work However, none of the things are actually used or required before debug_objects_mem_init() is invoked, so just move the call right before vmalloc_init(). According to tglx, "the reason why the call is at this place in start_kernel() is historical. It's because back in the days when debugobjects were added the memory allocator was enabled way later than today." Link: http://lkml.kernel.org/r/20181126102407.1836-1-cai@gmx.us Signed-off-by: Qian Cai Suggested-by: Thomas Gleixner Cc: Waiman Long Cc: Yang Shi Cc: Arnd Bergmann Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- lib/debugobjects.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/init/main.c b/init/main.c index 954d9b6c62c6..f6e901ec6b78 100644 --- a/init/main.c +++ b/init/main.c @@ -521,6 +521,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); pgtable_init(); + debug_objects_mem_init(); vmalloc_init(); ioremap_huge_init(); /* Should be run before the first non-init thread is created */ @@ -697,7 +698,6 @@ asmlinkage __visible void __init start_kernel(void) #endif page_ext_init(); kmemleak_init(); - debug_objects_mem_init(); setup_per_cpu_pageset(); numa_policy_init(); acpi_early_init(); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 14afeeb7d6ef..55437fd5128b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1131,11 +1131,10 @@ static int __init debug_objects_replace_static_objects(void) } /* - * When debug_objects_mem_init() is called we know that only - * one CPU is up, so disabling interrupts is enough - * protection. This avoids the lockdep hell of lock ordering. + * debug_objects_mem_init() is now called early that only one CPU is up + * and interrupts have been disabled, so it is safe to replace the + * active object references. 
*/ - local_irq_disable(); /* Remove the statically allocated objects from the pool */ hlist_for_each_entry_safe(obj, tmp, &obj_pool, node) @@ -1156,7 +1155,6 @@ static int __init debug_objects_replace_static_objects(void) cnt++; } } - local_irq_enable(); pr_debug("%d of %d active objects replaced\n", cnt, obj_pool_used); -- cgit v1.2.3-59-g8ed1b From 9705bea5f833f4fc21d5bef5fce7348427f76ea4 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Fri, 28 Dec 2018 00:34:24 -0800 Subject: mm: convert zone->managed_pages to atomic variable totalram_pages, zone->managed_pages and totalhigh_pages updates are protected by managed_page_count_lock, but readers never care about it. Convert these variables to atomic to avoid readers potentially seeing a store tear. This patch converts zone->managed_pages. Subsequent patches will convert totalram_panges, totalhigh_pages and eventually managed_page_count_lock will be removed. Main motivation was that managed_page_count_lock handling was complicating things. It was discussed in length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 So it seemes better to remove the lock and convert variables to atomic, with preventing poteintial store-to-read tearing as a bonus. Link: http://lkml.kernel.org/r/1542090790-21750-3-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Suggested-by: Michal Hocko Suggested-by: Vlastimil Babka Reviewed-by: Konstantin Khlebnikov Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +- include/linux/mmzone.h | 9 +++++-- lib/show_mem.c | 2 +- mm/memblock.c | 2 +- mm/page_alloc.c | 44 +++++++++++++++++------------------ mm/vmstat.c | 4 ++-- 6 files changed, 34 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index c02adbbeef2a..b7bc7d7d048f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -853,7 +853,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, */ pgdat = NODE_DATA(numa_node_id); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]); mem_in_bytes <<= PAGE_SHIFT; sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 077d797d1f60..a23e34e21178 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,7 +435,7 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages. 
*/ - unsigned long managed_pages; + atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; @@ -524,6 +524,11 @@ enum pgdat_flags { PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; +static inline unsigned long zone_managed_pages(struct zone *zone) +{ + return (unsigned long)atomic_long_read(&zone->managed_pages); +} + static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; @@ -820,7 +825,7 @@ static inline bool is_dev_zone(const struct zone *zone) */ static inline bool managed_zone(struct zone *zone) { - return zone->managed_pages; + return zone_managed_pages(zone); } /* Returns true if a zone has memory */ diff --git a/lib/show_mem.c b/lib/show_mem.c index 0beaa1d899aa..eefe67d50e84 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -28,7 +28,7 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) continue; total += zone->present_pages; - reserved += zone->present_pages - zone->managed_pages; + reserved += zone->present_pages - zone_managed_pages(zone); if (is_highmem_idx(zoneid)) highmem += zone->present_pages; diff --git a/mm/memblock.c b/mm/memblock.c index 81ae63ca78d0..0068f87af1e8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1950,7 +1950,7 @@ void reset_node_managed_pages(pg_data_t *pgdat) struct zone *z; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; + atomic_long_set(&z->managed_pages, 0); } void __init reset_all_zones_managed_pages(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b79e79caea99..4b5c4ff68f18 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1280,7 +1280,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) __ClearPageReserved(p); set_page_count(p, 0); - page_zone(page)->managed_pages += nr_pages; + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); set_page_refcounted(page); __free_pages(page, order); } @@ -2259,7 +2259,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, * Limit the number reserved to 1 pageblock or roughly 1% of a zone. * Check is race-prone but harmless. 
*/ - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) return; @@ -4661,7 +4661,7 @@ static unsigned long nr_free_zone_pages(int offset) struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->managed_pages; + unsigned long size = zone_managed_pages(zone); unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -4768,7 +4768,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += pgdat->node_zones[zone_type].managed_pages; + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); @@ -4777,7 +4777,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { - managed_highpages += zone->managed_pages; + managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } @@ -4984,7 +4984,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), - K(zone->managed_pages), + K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), @@ -5656,7 +5656,7 @@ static int zone_batchsize(struct zone *zone) * The per-cpu-pages pools are set to around 1000th of the * size of the zone. */ - batch = zone->managed_pages / 1024; + batch = zone_managed_pages(zone) / 1024; /* But no more than a meg. 
*/ if (batch * PAGE_SIZE > 1024 * 1024) batch = (1024 * 1024) / PAGE_SIZE; @@ -5766,7 +5766,7 @@ static void pageset_set_high_and_batch(struct zone *zone, { if (percpu_pagelist_fraction) pageset_set_high(pcp, - (zone->managed_pages / + (zone_managed_pages(zone) / percpu_pagelist_fraction)); else pageset_set_batch(pcp, zone_batchsize(zone)); @@ -6323,7 +6323,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, unsigned long remaining_pages) { - zone->managed_pages = remaining_pages; + atomic_long_set(&zone->managed_pages, remaining_pages); zone_set_nid(zone, nid); zone->name = zone_names[idx]; zone->zone_pgdat = NODE_DATA(nid); @@ -7076,7 +7076,7 @@ early_param("movablecore", cmdline_parse_movablecore); void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); - page_zone(page)->managed_pages += count; + atomic_long_add(count, &page_zone(page)->managed_pages); totalram_pages += count; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -7124,7 +7124,7 @@ void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; - page_zone(page)->managed_pages++; + atomic_long_inc(&page_zone(page)->managed_pages); totalhigh_pages++; } #endif @@ -7257,7 +7257,7 @@ static void calculate_totalreserve_pages(void) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7293,7 +7293,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); zone->lowmem_reserve[j] = 0; @@ -7311,7 +7311,7 @@ static void setup_per_zone_lowmem_reserve(void) lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; } - managed_pages += lower_zone->managed_pages; + managed_pages += zone_managed_pages(lower_zone); } } } @@ -7330,14 +7330,14 @@ static void __setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->managed_pages; + lowmem_pages += zone_managed_pages(zone); } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; + tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -7351,7 +7351,7 @@ static void __setup_per_zone_wmarks(void) */ unsigned long min_pages; - min_pages = zone->managed_pages / 1024; + min_pages = zone_managed_pages(zone) / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { @@ -7368,7 +7368,7 @@ static void __setup_per_zone_wmarks(void) * ensure a minimum size on small systems. 
*/ tmp = max_t(u64, tmp >> 2, - mult_frac(zone->managed_pages, + mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; @@ -7498,8 +7498,8 @@ static void setup_min_unmapped_ratio(void) pgdat->min_unmapped_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * + sysctl_min_unmapped_ratio) / 100; } @@ -7526,8 +7526,8 @@ static void setup_min_slab_ratio(void) pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, diff --git a/mm/vmstat.c b/mm/vmstat.c index 9c624595e904..83b30edc2f7f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone) * 125 1024 10 16-32 GB 9 */ - mem = zone->managed_pages >> (27 - PAGE_SHIFT); + mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); @@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone->managed_pages); + zone_managed_pages(zone)); seq_printf(m, "\n protection: (%ld", -- cgit v1.2.3-59-g8ed1b From c3a5c77afefa697bf87f15272c7257e1574cad56 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 28 Dec 2018 00:37:16 -0800 Subject: lib/show_mem.c: drop pgdat_resize_lock in show_mem() Function show_mem() is used to print system memory status when user requires or fail to allocate memory. Generally, this is a best effort information so any races with memory hotplug (or very theoretically an early initialization) should be tolerable and the worst that could happen is to print an imprecise node state. Drop the resize lock because this is the only place which might hold the lock from the interrupt context and so all other callers might use a simple spinlock. Even though this doesn't solve any real issue it makes the code easier to follow and tiny more effective. 
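This lock drop and the managed_pages conversion above lean on the same pattern: a diagnostic dump such as show_mem() takes no lock, but every counter it reads goes through an accessor over an atomic, so the worst it can see is a slightly stale value, never a torn one. A rough userspace sketch of that pattern, with C11 atomics standing in for the kernel's atomic_long_t (the struct and initial values below are illustrative, though the accessor names match the patch):

#include <stdatomic.h>
#include <stdio.h>

struct zone {
	atomic_long	managed_pages;	/* previously a plain unsigned long under a lock */
	unsigned long	present_pages;
};

/* Getter readers use instead of touching the field directly. */
static unsigned long zone_managed_pages(struct zone *zone)
{
	return (unsigned long)atomic_load(&zone->managed_pages);
}

/* Writers stay serialized elsewhere; the atomic only prevents store tearing. */
static void adjust_managed_page_count(struct zone *zone, long count)
{
	atomic_fetch_add(&zone->managed_pages, count);
}

int main(void)
{
	struct zone z = { .present_pages = 262144 };

	atomic_store(&z.managed_pages, 245760);
	adjust_managed_page_count(&z, -512);

	/* What a lockless, best-effort dump like show_mem() would report. */
	printf("present %lu, managed %lu, reserved %lu\n",
	       z.present_pages, zone_managed_pages(&z),
	       z.present_pages - zone_managed_pages(&z));
	return 0;
}

Dropping pgdat_resize_lock() in show_mem() is the same trade: stale-but-consistent output is acceptable for a best-effort report, so holding the lock there buys nothing.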
Link: http://lkml.kernel.org/r/20181129235532.9328-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index eefe67d50e84..6a042f53e7bb 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,10 +18,8 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { - unsigned long flags; int zoneid; - pgdat_resize_lock(pgdat, &flags); for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { struct zone *zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) @@ -33,7 +31,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) if (is_highmem_idx(zoneid)) highmem += zone->present_pages; } - pgdat_resize_unlock(pgdat, &flags); } printk("%lu pages RAM\n", total); -- cgit v1.2.3-59-g8ed1b From d239865ac804c91a621294ca7bece4241b006fae Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 28 Dec 2018 00:37:38 -0800 Subject: ioremap: rework pXd_free_pYd_page() API The recently merged API for ensuring break-before-make on page-table entries when installing huge mappings in the vmalloc/ioremap region is fairly counter-intuitive, resulting in the arch freeing functions (e.g. pmd_free_pte_page()) being called even on entries that aren't present. This resulted in a minor bug in the arm64 implementation, giving rise to spurious VM_WARN messages. This patch moves the pXd_present() checks out into the core code, refactoring the callsites at the same time so that we avoid the complex conjunctions when determining whether or not we can put down a huge mapping. Link: http://lkml.kernel.org/r/1544120495-17438-2-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Reviewed-by: Toshi Kani Suggested-by: Linus Torvalds Cc: Chintan Pandya Cc: Toshi Kani Cc: Thomas Gleixner Cc: Michal Hocko Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Sean Christopherson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ioremap.c | 56 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/ioremap.c b/lib/ioremap.c index 517f5853ffed..6c72764af19c 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -76,6 +76,25 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } +static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + if (!ioremap_pmd_enabled()) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -89,13 +108,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); - if (ioremap_pmd_enabled() && - ((next - addr) == PMD_SIZE) && - IS_ALIGNED(phys_addr + addr, PMD_SIZE) && - pmd_free_pte_page(pmd, addr)) { - if (pmd_set_huge(pmd, phys_addr + addr, prot)) - continue; - } + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot)) + continue; if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) return -ENOMEM; @@ -103,6 +117,25 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, return 0; } +static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + if (!ioremap_pud_enabled()) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -116,13 +149,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); - if (ioremap_pud_enabled() && - ((next - addr) == PUD_SIZE) && - IS_ALIGNED(phys_addr + addr, PUD_SIZE) && - pud_free_pmd_page(pud, addr)) { - if (pud_set_huge(pud, phys_addr + addr, prot)) - continue; - } + if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot)) + continue; if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 36ddc5a78c878e9b10c323d2fe88b0dae2f487eb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 28 Dec 2018 00:37:49 -0800 Subject: lib/ioremap: ensure phys_addr actually corresponds to a physical address The current ioremap() code uses a phys_addr variable at each level of page table, which is confusingly offset by subtracting the base virtual address being mapped so that adding the current virtual address back on when iterating through the page table entries gives back the corresponding physical address. This is fairly confusing and results in all users of phys_addr having to add the current virtual address back on. Instead, this patch just updates phys_addr when iterating over the page table entries, ensuring that it's always up-to-date and doesn't require explicit offsetting. 
Link: http://lkml.kernel.org/r/1544120495-17438-5-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Tested-by: Sean Christopherson Reviewed-by: Sean Christopherson Cc: Chintan Pandya Cc: Toshi Kani Cc: Thomas Gleixner Cc: Michal Hocko Cc: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ioremap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/ioremap.c b/lib/ioremap.c index 6c72764af19c..10d7c5485c39 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -101,19 +101,18 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, pmd_t *pmd; unsigned long next; - phys_addr -= addr; pmd = pmd_alloc(&init_mm, pud, addr); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot)) + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) continue; - if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) + if (ioremap_pte_range(pmd, addr, next, phys_addr, prot)) return -ENOMEM; - } while (pmd++, addr = next, addr != end); + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } @@ -142,19 +141,18 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, pud_t *pud; unsigned long next; - phys_addr -= addr; pud = pud_alloc(&init_mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot)) + if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) continue; - if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) + if (ioremap_pmd_range(pud, addr, next, phys_addr, prot)) return -ENOMEM; - } while (pud++, addr = next, addr != end); + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } @@ -164,7 +162,6 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, p4d_t *p4d; unsigned long next; - phys_addr -= addr; p4d = p4d_alloc(&init_mm, pgd, addr); if (!p4d) return -ENOMEM; @@ -173,14 +170,14 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, if (ioremap_p4d_enabled() && ((next - addr) == P4D_SIZE) && - IS_ALIGNED(phys_addr + addr, P4D_SIZE)) { - if (p4d_set_huge(p4d, phys_addr + addr, prot)) + IS_ALIGNED(phys_addr, P4D_SIZE)) { + if (p4d_set_huge(p4d, phys_addr, prot)) continue; } - if (ioremap_pud_range(p4d, addr, next, phys_addr + addr, prot)) + if (ioremap_pud_range(p4d, addr, next, phys_addr, prot)) return -ENOMEM; - } while (p4d++, addr = next, addr != end); + } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } @@ -196,14 +193,13 @@ int ioremap_page_range(unsigned long addr, BUG_ON(addr >= end); start = addr; - phys_addr -= addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr+addr, prot); + err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot); if (err) break; - } while (pgd++, addr = next, addr != end); + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); flush_cache_vmap(start, end); -- cgit v1.2.3-59-g8ed1b From 8e2d43405b22e98cf5f3730c1829ec1fdbe17ae7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 28 Dec 2018 00:37:53 -0800 Subject: lib/ioremap: ensure break-before-make is used for huge p4d mappings Whilst no architectures actually enable support for huge p4d mappings in the vmap area, the code that is implemented 
should be using break-before-make, as we do for pud and pmd huge entries. Link: http://lkml.kernel.org/r/1544120495-17438-6-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Reviewed-by: Toshi Kani Cc: Chintan Pandya Cc: Toshi Kani Cc: Thomas Gleixner Cc: Michal Hocko Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Sean Christopherson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 5 +++++ arch/x86/mm/pgtable.c | 8 ++++++++ include/asm-generic/pgtable.h | 5 +++++ lib/ioremap.c | 27 +++++++++++++++++++++------ 4 files changed, 39 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 13b80361d9f5..b6f5aa52ac67 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1043,6 +1043,11 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) return 1; } +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; /* Don't attempt a block mapping */ +} + #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e95a7d6ac8f8..b0284eab14dc 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -794,6 +794,14 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } +/* + * Until we support 512GB pages, skip them in the vmap area. + */ +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} + #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a9cac82e9a7a..05e61e6c843f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1057,6 +1057,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ @@ -1084,6 +1085,10 @@ static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } +static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; diff --git a/lib/ioremap.c b/lib/ioremap.c index 10d7c5485c39..063213685563 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -156,6 +156,25 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, return 0; } +static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + if (!ioremap_p4d_enabled()) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -168,12 +187,8 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); - if (ioremap_p4d_enabled() && - ((next - addr) == P4D_SIZE) && - IS_ALIGNED(phys_addr, P4D_SIZE)) { - if (p4d_set_huge(p4d, phys_addr, prot)) - continue; - } + if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + continue; if (ioremap_pud_range(p4d, addr, next, 
phys_addr, prot)) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From d53ce042277a94eadf9a8a31fc41fac54c67dec5 Mon Sep 17 00:00:00 2001 From: Sri Krishna chowdary Date: Fri, 28 Dec 2018 00:38:54 -0800 Subject: kmemleak: add config to select auto scan Kmemleak scan can be cpu intensive and can stall user tasks at times. To prevent this, add config DEBUG_KMEMLEAK_AUTO_SCAN to enable/disable auto scan on boot up. Also protect first_run with DEBUG_KMEMLEAK_AUTO_SCAN as this is meant for only first automatic scan. Link: http://lkml.kernel.org/r/1540231723-7087-1-git-send-email-prpatel@nvidia.com Signed-off-by: Sri Krishna chowdary Signed-off-by: Sachin Nikam Signed-off-by: Prateek Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 15 +++++++++++++++ mm/kmemleak.c | 10 ++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b3c91b9e32f8..2b5a4256e88b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -593,6 +593,21 @@ config DEBUG_KMEMLEAK_DEFAULT_OFF Say Y here to disable kmemleak by default. It can then be enabled on the command line via kmemleak=on. +config DEBUG_KMEMLEAK_AUTO_SCAN + bool "Enable kmemleak auto scan thread on boot up" + default y + depends on DEBUG_KMEMLEAK + help + Depending on the cpu, kmemleak scan may be cpu intensive and can + stall user tasks at times. This option enables/disables automatic + kmemleak scan at boot up. + + Say N here to disable kmemleak auto scan thread to stop automatic + scanning. Disabling this option disables automatic reporting of + memory leaks. + + If unsure, say Y. + config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL && !IA64 diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 9c3d2dea0861..f9d9dc250428 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1650,7 +1650,7 @@ static void kmemleak_scan(void) */ static int kmemleak_scan_thread(void *arg) { - static int first_run = 1; + static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN); pr_info("Automatic memory scanning thread started\n"); set_user_nice(current, 10); @@ -2144,9 +2144,11 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - mutex_lock(&scan_mutex); - start_scan_thread(); - mutex_unlock(&scan_mutex); + if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) { + mutex_lock(&scan_mutex); + start_scan_thread(); + mutex_unlock(&scan_mutex); + } pr_info("Kernel memory leak detector initialized\n"); -- cgit v1.2.3-59-g8ed1b
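The implementation detail worth noting in the patch above is that first_run is initialized from IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN), i.e. a Kconfig symbol evaluated as an ordinary C expression. Below is a stripped-down userspace reconstruction of that preprocessor trick, intended only to show how a defined/undefined config macro turns into a compile-time 1 or 0; the real macro lives in include/linux/kconfig.h and additionally handles =m tristate symbols.

#include <stdio.h>

#define __ARG_PLACEHOLDER_1		0,
#define __take_second_arg(__ignored, val, ...)	val
#define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x)			___is_defined(x)
#define IS_ENABLED(option)		__is_defined(option)	/* simplified: ignores =m */

#define CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN	1	/* comment this out to get 0 */

int main(void)
{
	/* Mirrors "static int first_run = IS_ENABLED(...)" in kmemleak_scan_thread(). */
	static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);

	printf("auto scan on boot: %d\n", first_run);
	return 0;
}

When the symbol is defined to 1, the token paste produces __ARG_PLACEHOLDER_1, which expands to "0," and shifts the literal 1 into the second-argument slot; when the symbol is undefined, no such expansion happens and the 0 stays in that slot. Because the result is an integer constant expression, it can initialize a static variable and lets the compiler discard the disabled branch, which is why the patch can drop start_scan_thread() entirely when auto scan is configured off.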