From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001
From: Sherry Yang <sherryy@android.com>
Date: Tue, 3 Oct 2017 16:15:00 -0700
Subject: android: binder: drop lru lock in isolate callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the global lru lock in isolate callback before calling
zap_page_range which calls cond_resched, and re-acquire the global lru
lock before returning.  Also change return code to LRU_REMOVED_RETRY.

Use mmput_async when fail to acquire mmap sem in an atomic context.

Fix "BUG: sleeping function called from invalid context"
errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled.

Also restore mmput_async, which was initially introduced in commit
ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom
reaper context"), and was removed in commit 212925802454 ("mm: oom: let
oom_reap_task and exit_mmap run concurrently").

Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com
Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder")
Signed-off-by: Sherry Yang <sherryy@android.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reported-by: Kyle Yan <kyan@codeaurora.org>
Acked-by: Arve Hjønnevåg <arve@android.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Martijn Coenen <maco@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Riley Andrews <riandrews@android.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hoeun Ryu <hoeun.ryu@gmail.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct,
+					    async_put_work);
+
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+#endif
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *
-- 
cgit v1.3-8-gc7d7


From ca182551857cc2c1e6a2b7f1e72090a137a15008 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 13 Oct 2017 15:58:22 -0700
Subject: kmemleak: clear stale pointers from task stacks

Kmemleak considers any pointers on task stacks as references.  This
patch clears newly allocated and reused vmap stacks.

Link: http://lkml.kernel.org/r/150728990124.744199.8403409836394318684.stgit@buzz
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/thread_info.h | 2 +-
 kernel/fork.c               | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 905d769d8ddc..5f7eeab990fe 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -42,7 +42,7 @@ enum {
 #define THREAD_ALIGN	THREAD_SIZE
 #endif
 
-#ifdef CONFIG_DEBUG_STACK_USAGE
+#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
 # define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
 				 __GFP_ZERO)
 #else
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
-- 
cgit v1.3-8-gc7d7


From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:33 -0800
Subject: mm: account pud page tables

On a machine with 5-level paging support a process can allocate
significant amount of memory and stay unnoticed by oom-killer and memory
cgroup.  The trick is to allocate a lot of PUD page tables.  We don't
account PUD page tables, only PMD and PTE.

We already addressed the same issue for PMD page tables, see commit
dc6c9a35b66b ("mm: account pmd page tables to the process").
Introduction of 5-level paging brings the same issue for PUD page
tables.

The patch expands accounting to PUD level.

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
  Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
  Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt         |  8 ++++----
 arch/powerpc/mm/hugetlbpage.c       |  1 +
 arch/s390/include/asm/mmu_context.h |  4 +++-
 arch/sparc/mm/hugetlbpage.c         |  1 +
 fs/proc/task_mmu.c                  |  5 ++++-
 include/linux/mm.h                  | 36 +++++++++++++++++++++++++++++++++---
 include/linux/mm_types.h            |  3 +++
 kernel/fork.c                       |  4 ++++
 mm/debug.c                          |  6 ++++--
 mm/memory.c                         | 15 +++++++++------
 mm/oom_kill.c                       |  8 +++++---
 11 files changed, 71 insertions(+), 20 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 30fd16b14196..2729e8db9492 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -629,10 +629,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
-score, and name.  This is helpful to determine why the OOM killer was
-invoked, to identify the rogue task that caused it, and to determine why
-the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
+oom_score_adj score, and name.  This is helpful to determine why the OOM
+killer was invoked, to identify the rogue task that caused it, and to
+determine why the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1571a498a33f..a9b9083c5e49 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 /*
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 43607bb12cc2..cf4c1cb17dcd 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk,
 		mm->context.asce_limit = STACK_TOP_MAX;
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+		/* pgd_alloc() did not account this pud */
+		mm_inc_nr_puds(mm);
 		break;
 	case -PAGE_SIZE:
 		/* forked 5-level task, set new asce with new_mm->pgd */
@@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk,
 		/* forked 2-level compat task, set new asce with new mm->pgd */
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
-		/* pgd_alloc() did not increase mm->nr_pmds */
+		/* pgd_alloc() did not account this pmd */
 		mm_inc_nr_pmds(mm);
 	}
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5078b7f68890..01f63b4ee2b4 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6744bd706ecf..a05ce6186f99 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
+	unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	swap = get_mm_counter(mm, MM_SWAPENTS);
 	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
 	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
+	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
 		"VmPMD:\t%8lu kB\n"
+		"VmPUD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		ptes >> 10,
 		pmds >> 10,
+		puds >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 91b46f99b4d2..9af86f39d928 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
 #endif
 
-#ifdef __PAGETABLE_PUD_FOLDED
+#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 						unsigned long address)
 {
 	return 0;
 }
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_nr_puds_init(struct mm_struct *mm) {}
+static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
+
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
+
+static inline void mm_nr_puds_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_puds, 0);
+}
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_puds);
+}
+
+static inline void mm_inc_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_puds);
+}
+
+static inline void mm_dec_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_puds);
+}
 #endif
 
 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 
 static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return 0;
 }
@@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm)
 	atomic_long_set(&mm->nr_pmds, 0);
 }
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return atomic_long_read(&mm->nr_pmds);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d1b8e8f97fc2..e9e561e02d22 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -404,6 +404,9 @@ struct mm_struct {
 	atomic_long_t nr_ptes;			/* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+	atomic_long_t nr_puds;			/* PUD page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..a4eb6f289365 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
 	mm_nr_pmds_init(mm);
+	mm_nr_puds_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm)
 	if (mm_nr_pmds(mm))
 		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
 				mm_nr_pmds(mm));
+	if (mm_nr_puds(mm))
+		pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
+				mm_nr_puds(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..a12d826bb774 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d\n"
+		"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm)
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
 		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
-		mm_nr_pmds((struct mm_struct *)mm),
+		mm_nr_pmds(mm),
+		mm_nr_puds(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/memory.c b/mm/memory.c
index 42fb30300bb5..6bbd4078ec98 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 	pud = pud_offset(p4d, start);
 	p4d_clear(p4d);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_5LEVEL_HACK
-	if (p4d_present(*p4d))		/* Another has populated it */
-		pud_free(mm, new);
-	else
+	if (!p4d_present(*p4d)) {
+		mm_inc_nr_puds(mm);
 		p4d_populate(mm, p4d, new);
-#else
-	if (pgd_present(*p4d))		/* Another has populated it */
+	} else	/* Another has populated it */
 		pud_free(mm, new);
-	else
+#else
+	if (!pgd_present(*p4d)) {
+		mm_inc_nr_puds(mm);
 		pgd_populate(mm, p4d, new);
+	} else	/* Another has populated it */
+		pud_free(mm, new);
 #endif /* __ARCH_HAS_5LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3023919970f7..f642a45b7f14 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
+		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
+		mm_nr_puds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
 			mm_nr_pmds(task->mm),
+			mm_nr_puds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.3-8-gc7d7


From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:37 -0800
Subject: mm: introduce wrappers to access mm->nr_ptes

Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd
and nr_pud.

The patch also makes nr_ptes accounting dependent onto CONFIG_MMU.  Page
table accounting doesn't make sense if you don't have page tables.

It's preparation for consolidation of page-table counters in mm_struct.

Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/pgd.c           |  2 +-
 arch/sparc/mm/hugetlbpage.c |  2 +-
 arch/unicore32/mm/pgd.c     |  2 +-
 fs/proc/task_mmu.c          |  2 +-
 include/linux/mm.h          | 32 ++++++++++++++++++++++++++++++++
 include/linux/mm_types.h    |  2 ++
 kernel/fork.c               |  6 +++---
 mm/debug.c                  |  2 +-
 mm/huge_memory.c            | 10 +++++-----
 mm/khugepaged.c             |  2 +-
 mm/memory.c                 |  8 ++++----
 mm/oom_kill.c               |  5 ++---
 12 files changed, 54 insertions(+), 21 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index c1c1a5c67da1..61e281cb29fb 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base)
 	pte = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free(mm, pte);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 no_pmd:
 	pud_clear(pud);
 	pmd_free(mm, pmd);
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 01f63b4ee2b4..0112d6942288 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	atomic_long_dec(&tlb->mm->nr_ptes);
+	mm_dec_nr_ptes(tlb->mm);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
index c572a28c76c9..a830a300aaa1 100644
--- a/arch/unicore32/mm/pgd.c
+++ b/arch/unicore32/mm/pgd.c
@@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
 	pte = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free(mm, pte);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 	pmd_free(mm, pmd);
 	mm_dec_nr_pmds(mm);
 free:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a05ce6186f99..9bd2a0294ac1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+	ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm);
 	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
 	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9af86f39d928..2ca799f0d762 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_MMU
+static inline void mm_nr_ptes_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_ptes, 0);
+}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_ptes);
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_ptes);
+}
+
+static inline void mm_dec_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_ptes);
+}
+#else
+static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
+static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
+#endif
+
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e9e561e02d22..e42048020664 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,7 +401,9 @@ struct mm_struct {
 	 */
 	atomic_t mm_count;
 
+#ifdef CONFIG_MMU
 	atomic_long_t nr_ptes;			/* PTE page table pages */
+#endif
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a4eb6f289365..946922a30ede 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	atomic_long_set(&mm->nr_ptes, 0);
+	mm_nr_ptes_init(mm);
 	mm_nr_pmds_init(mm);
 	mm_nr_puds_init(mm);
 	mm->map_count = 0;
@@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (atomic_long_read(&mm->nr_ptes))
+	if (mm_nr_ptes(mm))
 		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				atomic_long_read(&mm->nr_ptes));
+				mm_nr_ptes(mm));
 	if (mm_nr_pmds(mm))
 		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
 				mm_nr_pmds(mm));
diff --git a/mm/debug.c b/mm/debug.c
index a12d826bb774..c9888a6d7875 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
-		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+		mm_nr_ptes(mm),
 		mm_nr_pmds(mm),
 		mm_nr_puds(mm),
 		mm->map_count,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cc65fb87c9db..3610d81c062a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
 	}
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	if (pgtable)
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	atomic_long_inc(&mm->nr_ptes);
+	mm_inc_nr_ptes(mm);
 	return true;
 }
 
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	if (pgtable) {
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
-		atomic_long_inc(&mm->nr_ptes);
+		mm_inc_nr_ptes(mm);
 	}
 
 	set_pmd_at(mm, addr, pmd, entry);
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	get_page(src_page);
 	page_dup_rmap(src_page, true);
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	atomic_long_inc(&dst_mm->nr_ptes);
+	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1695,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pte_free(mm, pgtable);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 }
 
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
-			atomic_long_dec(&vma->vm_mm->nr_ptes);
+			mm_dec_nr_ptes(vma->vm_mm);
 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
 		}
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 6bbd4078ec98..6dec21b182b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	atomic_long_dec(&tlb->mm->nr_ptes);
+	mm_dec_nr_ptes(tlb->mm);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -666,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 
 	ptl = pmd_lock(mm, pmd);
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		atomic_long_inc(&mm->nr_ptes);
+		mm_inc_nr_ptes(mm);
 		pmd_populate(mm, pmd, new);
 		new = NULL;
 	}
@@ -3238,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
 			goto map_pte;
 		}
 
-		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		mm_inc_nr_ptes(vma->vm_mm);
 		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
 		spin_unlock(vmf->ptl);
 		vmf->prealloc_pte = NULL;
@@ -3297,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
 	 * We are going to consume the prealloc table,
 	 * count that as nr_ptes.
 	 */
-	atomic_long_inc(&vma->vm_mm->nr_ptes);
+	mm_inc_nr_ptes(vma->vm_mm);
 	vmf->prealloc_pte = NULL;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f642a45b7f14..f9300141480e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,8 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
-		mm_nr_puds(p->mm);
+		mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -417,7 +416,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_ptes(task->mm),
 			mm_nr_pmds(task->mm),
 			mm_nr_puds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
-- 
cgit v1.3-8-gc7d7


From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:40 -0800
Subject: mm: consolidate page table accounting

Currently, we account page tables separately for each page table level,
but that's redundant -- we only make use of total memory allocated to
page tables for oom_badness calculation.  We also provide the
information to userspace, but it has dubious value there too.

This patch switches page table accounting to single counter.

mm->pgtables_bytes is now used to account all page table levels.  We use
bytes, because page table size for different levels of page table tree
may be different.

The change has user-visible effect: we don't have VmPMD and VmPUD
reported in /proc/[pid]/status.  Not sure if anybody uses them.  (As
alternative, we can always report 0 kB for them.)

OOM-killer report is also slightly changed: we now report pgtables_bytes
instead of nr_ptes, nr_pmd, nr_puds.

Apart from reducing number of counters per-mm, the benefit is that we
now calculate oom_badness() more correctly for machines which have
different size of page tables depending on level or where page tables
are less than a page in size.

The only downside can be debuggability because we do not know which page
table level could leak.  But I do not remember many bugs that would be
caught by separate counters so I wouldn't lose sleep over this.

[akpm@linux-foundation.org: fix mm/huge_memory.c]
Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
[kirill.shutemov@linux.intel.com: fix build]
  Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  1 -
 Documentation/sysctl/vm.txt        |  8 +++---
 fs/proc/task_mmu.c                 | 11 ++------
 include/linux/mm.h                 | 58 ++++++++------------------------------
 include/linux/mm_types.h           |  8 +-----
 kernel/fork.c                      | 16 +++--------
 mm/debug.c                         |  7 ++---
 mm/huge_memory.c                   |  2 +-
 mm/oom_kill.c                      | 14 ++++-----
 9 files changed, 32 insertions(+), 93 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index adba21b5ada7..ec571b9bb18a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8)
  VmExe                       size of text segment
  VmLib                       size of shared library code
  VmPTE                       size of page table entries
- VmPMD                       size of second level page tables
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
  HugetlbPages                size of hugetlb memory portions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 2729e8db9492..3e579740b49f 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -629,10 +629,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
-oom_score_adj score, and name.  This is helpful to determine why the OOM
-killer was invoked, to identify the rogue task that caused it, and to
-determine why the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
+score, and name.  This is helpful to determine why the OOM killer was
+invoked, to identify the rogue task that caused it, and to determine why
+the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd2a0294ac1..875231c36cb3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
+	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -50,9 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm);
-	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
-	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -68,8 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
-		"VmPMD:\t%8lu kB\n"
-		"VmPUD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -82,9 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		shmem << (PAGE_SHIFT-10),
 		mm->data_vm << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		ptes >> 10,
-		pmds >> 10,
-		puds >> 10,
+		mm_pgtables_bytes(mm) >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ca799f0d762..7c1e82a1aa77 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 {
 	return 0;
 }
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void mm_nr_puds_init(struct mm_struct *mm) {}
 static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
 
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
 
-static inline void mm_nr_puds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_puds, 0);
-}
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_puds);
-}
-
 static inline void mm_inc_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_puds);
+	atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_puds);
+	atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 #endif
 
@@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 	return 0;
 }
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
 
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_pmds, 0);
-}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_pmds);
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_pmds);
+	atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_pmds);
+	atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 #endif
 
 #ifdef CONFIG_MMU
-static inline void mm_nr_ptes_init(struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
 {
-	atomic_long_set(&mm->nr_ptes, 0);
+	atomic_long_set(&mm->pgtables_bytes, 0);
 }
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
-	return atomic_long_read(&mm->nr_ptes);
+	return atomic_long_read(&mm->pgtables_bytes);
 }
 
 static inline void mm_inc_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_ptes);
+	atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_ptes);
+	atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 #else
-static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
 	return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e42048020664..09643e0472fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -402,13 +402,7 @@ struct mm_struct {
 	atomic_t mm_count;
 
 #ifdef CONFIG_MMU
-	atomic_long_t nr_ptes;			/* PTE page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 2
-	atomic_long_t nr_pmds;			/* PMD page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 3
-	atomic_long_t nr_puds;			/* PUD page table pages */
+	atomic_long_t pgtables_bytes;		/* PTE page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 946922a30ede..006dc5899a1a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	mm_nr_ptes_init(mm);
-	mm_nr_pmds_init(mm);
-	mm_nr_puds_init(mm);
+	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (mm_nr_ptes(mm))
-		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				mm_nr_ptes(mm));
-	if (mm_nr_pmds(mm))
-		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
-				mm_nr_pmds(mm));
-	if (mm_nr_puds(mm))
-		pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
-				mm_nr_puds(mm));
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
diff --git a/mm/debug.c b/mm/debug.c
index c9888a6d7875..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d\n"
-		"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
-		mm_nr_ptes(mm),
-		mm_nr_pmds(mm),
-		mm_nr_puds(mm),
+		mm_pgtables_bytes(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3610d81c062a..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		atomic_long_inc(&dst_mm->nr_ptes);
+		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9300141480e..26add8a0d1f7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm);
+		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
 	task_unlock(p);
 
 	/*
@@ -389,15 +389,15 @@ static void select_bad_process(struct oom_control *oc)
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
  * are not shown.
- * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
- * swapents, oom_score_adj value, and name.
+ * State information includes task's pid, uid, tgid, vm size, rss,
+ * pgtables_bytes, swapents, oom_score_adj value, and name.
  */
 static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,12 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			mm_nr_ptes(task->mm),
-			mm_nr_pmds(task->mm),
-			mm_nr_puds(task->mm),
+			mm_pgtables_bytes(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.3-8-gc7d7


From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:54 -0800
Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK

Convert all allocations that used a NOTRACK flag to stop using it.

Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/pgalloc.h       |  2 +-
 arch/arm64/include/asm/pgalloc.h     |  2 +-
 arch/powerpc/include/asm/pgalloc.h   |  2 +-
 arch/sh/kernel/dwarf.c               |  4 ++--
 arch/sh/kernel/process.c             |  2 +-
 arch/sparc/mm/init_64.c              |  4 ++--
 arch/unicore32/include/asm/pgalloc.h |  2 +-
 arch/x86/kernel/espfix_64.c          |  2 +-
 arch/x86/mm/init.c                   |  3 +--
 arch/x86/mm/init_64.c                |  2 +-
 arch/x86/mm/pageattr.c               | 10 +++++-----
 arch/x86/mm/pgtable.c                |  2 +-
 arch/x86/platform/efi/efi_64.c       |  2 +-
 crypto/xor.c                         |  7 +------
 include/linux/thread_info.h          |  5 ++---
 init/do_mounts.c                     |  3 +--
 kernel/fork.c                        | 12 ++++++------
 kernel/signal.c                      |  3 +--
 mm/kmemcheck.c                       |  2 +-
 mm/slab.c                            |  2 +-
 mm/slab.h                            |  5 ++---
 mm/slab_common.c                     |  2 +-
 mm/slub.c                            |  4 +---
 23 files changed, 36 insertions(+), 48 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index b2902a5cd780..2d7344f0e208 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 
 static inline void clean_pte_table(pte_t *pte)
 {
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index d25f4f137c2a..5ca6a573a701 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -26,7 +26,7 @@
 
 #define check_pgt_cache()		do { } while (0)
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
 
 #if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index a14203c005f1..e11f03007b57 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp)
 }
 #endif /* MODULE */
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/book3s/pgalloc.h>
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index e1d751ae2498..1a2526676a87 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void)
 
 	dwarf_frame_cachep = kmem_cache_create("dwarf_frames",
 			sizeof(struct dwarf_frame), 0,
-			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
+			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
 
 	dwarf_reg_cachep = kmem_cache_create("dwarf_regs",
 			sizeof(struct dwarf_reg), 0,
-			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
+			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
 
 	dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ,
 						    dwarf_frame_cachep);
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index b2d9963d5978..68b1a67533ce 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -59,7 +59,7 @@ void arch_task_cache_init(void)
 
 	task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size,
 					       __alignof__(union thread_xstate),
-					       SLAB_PANIC | SLAB_NOTRACK, NULL);
+					       SLAB_PANIC, NULL);
 }
 
 #ifdef CONFIG_SH_FPU_EMU
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 61bdc1270d19..2de22d703076 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2927,7 +2927,7 @@ void __flush_tlb_all(void)
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 			    unsigned long address)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	pte_t *pte = NULL;
 
 	if (page)
@@ -2939,7 +2939,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 pgtable_t pte_alloc_one(struct mm_struct *mm,
 			unsigned long address)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
 		return NULL;
 	if (!pgtable_page_ctor(page)) {
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 26775793c204..f0fdb268f8f2 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
 #define pgd_alloc(mm)			get_pgd_slow(mm)
 #define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd)
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 
 /*
  * Allocate one PTE table.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 7d7715dde901..e5ec3cafa72e 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
 # error "Need more virtual address space for the ESPFIX hack"
 #endif
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
 /* This contains the *bottom* address of the espfix stack */
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..ef94620ceb8a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
 		unsigned int order;
 
 		order = get_order((unsigned long)num << PAGE_SHIFT);
-		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
-						__GFP_ZERO, order);
+		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
 	}
 
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..5fa3a58b5d78 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
 	void *ptr;
 
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	base = alloc_pages(GFP_KERNEL, 0);
 	if (!debug_pagealloc_enabled())
 		spin_lock(&cpa_lock);
 	if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 
 static int alloc_pte_page(pmd_t *pmd)
 {
-	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pte)
 		return -1;
 
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
 
 static int alloc_pmd_page(pud_t *pud)
 {
-	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pmd)
 		return -1;
 
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	pgd_entry = cpa->pgd + pgd_index(addr);
 
 	if (pgd_none(*pgd_entry)) {
-		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
 		if (!p4d)
 			return -1;
 
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	 */
 	p4d = p4d_offset(pgd_entry, addr);
 	if (p4d_none(*p4d)) {
-		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pud)
 			return -1;
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9e4ee5b04b2d..6a151ce70e86 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void)
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
-	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+	gfp_mask = GFP_KERNEL | __GFP_ZERO;
 	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
 	if (!efi_pgd)
 		return -ENOMEM;
diff --git a/crypto/xor.c b/crypto/xor.c
index 263af9fb45ea..bce9fe7af40a 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -122,12 +122,7 @@ calibrate_xor_blocks(void)
 		goto out;
 	}
 
-	/*
-	 * Note: Since the memory is not actually used for _anything_ but to
-	 * test the XOR speed, we don't really want kmemcheck to warn about
-	 * reading uninitialized bytes here.
-	 */
-	b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2);
+	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
 	if (!b1) {
 		printk(KERN_WARNING "xor: Yikes!  No memory available.\n");
 		return -ENOMEM;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 4bcdf00c110f..34f053a150a9 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -44,10 +44,9 @@ enum {
 #endif
 
 #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
-				 __GFP_ZERO)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT)
 #endif
 
 /*
diff --git a/init/do_mounts.c b/init/do_mounts.c
index f6d4dd764a52..7cf4f6dafd5f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
 
 void __init mount_block_root(char *name, int flags)
 {
-	struct page *page = alloc_page(GFP_KERNEL |
-					__GFP_NOTRACK_FALSE_POSITIVE);
+	struct page *page = alloc_page(GFP_KERNEL);
 	char *fs_names = page_address(page);
 	char *p;
 #ifdef CONFIG_BLOCK
diff --git a/kernel/fork.c b/kernel/fork.c
index 006dc5899a1a..4e55eedba8d6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep = kmem_cache_create("task_struct",
 			arch_task_struct_size, align,
-			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+			SLAB_PANIC|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -2205,18 +2205,18 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
-			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+			SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2227,7 +2227,7 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
diff --git a/kernel/signal.c b/kernel/signal.c
index 8dcd8825b2de..aa1fb9f905db 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	else
 		override_rlimit = 0;
 
-	q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
-		override_rlimit);
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
 	if (q) {
 		list_add_tail(&q->list, &pending->list);
 		switch ((unsigned long) info) {
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 800d64b854ea..b3a4d61d341c 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -18,7 +18,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
 	 * With kmemcheck enabled, we need to allocate a memory area for the
 	 * shadow bits as well.
 	 */
-	shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+	shadow = alloc_pages_node(node, flags, order);
 	if (!shadow) {
 		if (printk_ratelimit())
 			pr_err("kmemcheck: failed to allocate shadow bitmap\n");
diff --git a/mm/slab.c b/mm/slab.c
index c84365e9a591..183e996dde5f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1410,7 +1410,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 
 	flags |= cachep->allocflags;
 
-	page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+	page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page) {
 		slab_out_of_memory(cachep, flags, nodeid);
 		return NULL;
diff --git a/mm/slab.h b/mm/slab.h
index e60a3d1d8f6f..ad657ffa44e5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -141,10 +141,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
 			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
-			  SLAB_NOTRACK | SLAB_ACCOUNT)
+			  SLAB_ACCOUNT)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
+			  SLAB_TEMPORARY | SLAB_ACCOUNT)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -163,7 +163,6 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
 			      SLAB_NOLEAKTRACE | \
 			      SLAB_RECLAIM_ACCOUNT | \
 			      SLAB_TEMPORARY | \
-			      SLAB_NOTRACK | \
 			      SLAB_ACCOUNT)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 175e86637afd..c8cb36774ba1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
-			 SLAB_NOTRACK | SLAB_ACCOUNT)
+			 SLAB_ACCOUNT)
 
 /*
  * Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index ac3b50b9abec..91aa99b4b836 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1436,8 +1436,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 	struct page *page;
 	int order = oo_order(oo);
 
-	flags |= __GFP_NOTRACK;
-
 	if (node == NUMA_NO_NODE)
 		page = alloc_pages(flags, order);
 	else
@@ -3774,7 +3772,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	struct page *page;
 	void *ptr = NULL;
 
-	flags |= __GFP_COMP | __GFP_NOTRACK;
+	flags |= __GFP_COMP;
 	page = alloc_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
-- 
cgit v1.3-8-gc7d7


From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001
From: Gargi Sharma <gs051095@gmail.com>
Date: Fri, 17 Nov 2017 15:30:34 -0800
Subject: pid: remove pidhash

pidhash is no longer required as all the information can be looked up
from idr tree.  nr_hashed represented the number of pids that had been
hashed.  Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant,
it has been renamed to pid_allocated and PIDNS_ADDING respectively.

[gs051095@gmail.com: v6]
  Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com
Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com
Signed-off-by: Gargi Sharma <gs051095@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>	[ia64]
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/asm-offsets.c |  4 ++--
 include/linux/init_task.h      |  1 -
 include/linux/pid.h            |  2 --
 include/linux/pid_namespace.h  |  4 ++--
 init/main.c                    |  1 -
 kernel/fork.c                  |  2 +-
 kernel/pid.c                   | 48 +++++++++---------------------------------
 kernel/pid_namespace.c         |  6 +++---
 8 files changed, 18 insertions(+), 50 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index f7693f49c573..f4db2168d1b8 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -31,8 +31,8 @@ void foo(void)
 	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
 	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
 
-	BUILD_BUG_ON(sizeof(struct upid) != 32);
-	DEFINE(IA64_UPID_SHIFT, 5);
+	BUILD_BUG_ON(sizeof(struct upid) != 16);
+	DEFINE(IA64_UPID_SHIFT, 4);
 
 	BLANK();
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8062e6cc607c..6a532629c983 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -105,7 +105,6 @@ extern struct group_info init_groups;
 	.numbers	= { {						\
 		.nr		= 0,					\
 		.ns		= &init_pid_ns,				\
-		.pid_chain	= { .next = NULL, .pprev = NULL },	\
 	}, }								\
 }
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index dfd684ce0787..7633d55d9a24 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -51,10 +51,8 @@ enum pid_type
  */
 
 struct upid {
-	/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
 	int nr;
 	struct pid_namespace *ns;
-	struct hlist_node pid_chain;
 };
 
 struct pid
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 92c6aa509d2e..49538b172483 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -25,7 +25,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct idr idr;
 	struct rcu_head rcu;
-	unsigned int nr_hashed;
+	unsigned int pid_allocated;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
@@ -49,7 +49,7 @@ struct pid_namespace {
 
 extern struct pid_namespace init_pid_ns;
 
-#define PIDNS_HASH_ADDING (1U << 31)
+#define PIDNS_ADDING (1U << 31)
 
 #ifdef CONFIG_PID_NS
 static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
diff --git a/init/main.c b/init/main.c
index d0cbcfc06124..dfec3809e740 100644
--- a/init/main.c
+++ b/init/main.c
@@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void)
 	 * kmem_cache_init()
 	 */
 	setup_log_buf(0);
-	pidhash_init();
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e55eedba8d6..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
-	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
 		retval = -ENOMEM;
 		goto bad_fork_cancel_cgroup;
 	}
diff --git a/kernel/pid.c b/kernel/pid.c
index 0ce59369632f..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,10 +41,6 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
-#define pid_hashfn(nr, ns)	\
-	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-
 /*
  * PID-map pages start out as NULL, they get allocated upon
  * first use and are never deallocated. This way a low pid_max
@@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT;
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
 	.idr = IDR_INIT,
-	.nr_hashed = PIDNS_HASH_ADDING,
+	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
@@ -123,8 +118,7 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
-		hlist_del_rcu(&upid->pid_chain);
-		switch (--ns->nr_hashed) {
+		switch (--ns->pid_allocated) {
 		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
@@ -133,10 +127,10 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
-		case PIDNS_HASH_ADDING:
+		case PIDNS_ADDING:
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
-			ns->nr_hashed = 0;
+			ns->pid_allocated = 0;
 			/* fall through */
 		case 0:
 			schedule_work(&ns->proc_work);
@@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-	if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 	for ( ; upid >= pid->numbers; --upid) {
-		hlist_add_head_rcu(&upid->pid_chain,
-				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
-		upid->ns->nr_hashed++;
+		upid->ns->pid_allocated++;
 	}
 	spin_unlock_irq(&pidmap_lock);
 
@@ -243,21 +235,13 @@ out_free:
 void disable_pid_allocation(struct pid_namespace *ns)
 {
 	spin_lock_irq(&pidmap_lock);
-	ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+	ns->pid_allocated &= ~PIDNS_ADDING;
 	spin_unlock_irq(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 {
-	struct upid *pnr;
-
-	hlist_for_each_entry_rcu(pnr,
-			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
-		if (pnr->nr == nr && pnr->ns == ns)
-			return container_of(pnr, struct pid,
-					numbers[ns->level]);
-
-	return NULL;
+	return idr_find(&ns->idr, nr);
 }
 EXPORT_SYMBOL_GPL(find_pid_ns);
 
@@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 		if (type != PIDTYPE_PID) {
 			if (type == __PIDTYPE_TGID)
 				type = PIDTYPE_PID;
+
 			task = task->group_leader;
 		}
 		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 	return idr_get_next(&ns->idr, &nr);
 }
 
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
-	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
-					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
-					   &pidhash_shift, NULL,
-					   0, 4096);
-}
-
 void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
-	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
 	pid_max = min(pid_max_max, max_t(int, pid_max,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca7c8a8823b1..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
-	ns->nr_hashed = PIDNS_HASH_ADDING;
+	ns->pid_allocated = PIDNS_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	return ns;
@@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
 	 * really care, we could reparent them to the global init. We could
 	 * exit and reap ->child_reaper even if it is not the last thread in
-	 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
 	 * pid_ns can not go away until proc_kill_sb() drops the reference.
 	 *
 	 * But this ns can also have other tasks injected by setns()+fork().
@@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 */
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (pid_ns->nr_hashed == init_pids)
+		if (pid_ns->pid_allocated == init_pids)
 			break;
 		schedule();
 	}
-- 
cgit v1.3-8-gc7d7