From 62f75532b583c03840f31e40386ce2df73be9ca0 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Mon, 14 Apr 2008 18:50:44 +0300
Subject: slub: Initialize per-cpu stats

As spotted by kmemcheck, we need to initialize the per-CPU ->stat array
before using it.

[kmem_cache_cpu structures are usually allocated from arrays defined via
DEFINE_PER_CPU that are zeroed so we have not noticed this so far --cl].

Reported-by: Vegard Nossum
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index acc975fcc8cc..15a7a0d45d71 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1886,6 +1886,9 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
 	c->node = 0;
 	c->offset = s->offset / sizeof(void *);
 	c->objsize = s->objsize;
+#ifdef CONFIG_SLUB_STATS
+	memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
+#endif
 }

 static void init_kmem_cache_node(struct kmem_cache_node *n)
--
cgit v1.2.3-59-g8ed1b

From 4097d6017576a5e138f442f5e3c393ad00d10f58 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:51:18 +0300
Subject: slub: Reduce #ifdef ZONE_DMA by moving kmalloc_caches_dma near dma logic

Move the definition of kmalloc_caches_dma() into a later #ifdef
CONFIG_ZONE_DMA. This saves one #ifdef and leaves us with a total of two
#ifdefs for dma slab support.

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 15a7a0d45d71..3df6d5bdd711 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2412,10 +2412,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);

-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
-#endif
-
 static int __init setup_slub_min_order(char *str)
 {
 	get_option(&str, &slub_min_order);
@@ -2475,6 +2471,7 @@ panic:
 }

 #ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];

 static void sysfs_add_func(struct work_struct *w)
 {
--
cgit v1.2.3-59-g8ed1b

From 5b06c853ad447636e31d105e95c48ae9abb6bfb5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:51:34 +0300
Subject: slub: Deal with config variable dependencies

count_partial() is used by both slabinfo and the sysfs proc support. Move
the function directly before the beginning of the sysfs code so that it
can be easily found. Rework the preprocessor conditional to take into
account that slub sysfs support depends on CONFIG_SYSFS *and*
CONFIG_SLUB_DEBUG.

Make CONFIG_SLUB_STATS depend on CONFIG_SLUB_DEBUG and CONFIG_SYSFS.
There is no point in keeping statistics if no one can retrieve them.

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 lib/Kconfig.debug |  2 +-
 mm/slub.c         | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0796c1a090c0..eef557dc46c3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -211,7 +211,7 @@ config SLUB_DEBUG_ON
 config SLUB_STATS
 	default n
 	bool "Enable SLUB performance statistics"
-	depends on SLUB
+	depends on SLUB && SLUB_DEBUG && SYSFS
 	help
 	  SLUB statistics are useful to debug SLUBs allocation behavior
 	  in order find ways to optimize the allocator.
	  This should never be

diff --git a/mm/slub.c b/mm/slub.c
index 3df6d5bdd711..3fcdcf7d77ba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2688,21 +2688,6 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);

-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO)
-static unsigned long count_partial(struct kmem_cache_node *n)
-{
-	unsigned long flags;
-	unsigned long x = 0;
-	struct page *page;
-
-	spin_lock_irqsave(&n->list_lock, flags);
-	list_for_each_entry(page, &n->partial, lru)
-		x += page->inuse;
-	spin_unlock_irqrestore(&n->list_lock, flags);
-	return x;
-}
-#endif
-
 /*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
@@ -3181,6 +3166,21 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	return slab_alloc(s, gfpflags, node, caller);
 }

+#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
+static unsigned long count_partial(struct kmem_cache_node *n)
+{
+	unsigned long flags;
+	unsigned long x = 0;
+	struct page *page;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry(page, &n->partial, lru)
+		x += page->inuse;
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return x;
+}
+#endif
+
 #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
 static int validate_slab(struct kmem_cache *s, struct page *page,
 						unsigned long *map)
--
cgit v1.2.3-59-g8ed1b

From 50ef37b96c11e76625067ae413dc54585ea22585 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:52:05 +0300
Subject: slub: Fixes to per cpu stat output in sysfs

Only output per cpu stats if the kernel is built for SMP.

Use a capital "C" as a leading character for the processor number (same as
the numa statistics that also use a capital letter "N").

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 3fcdcf7d77ba..23e5ee7b149f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3979,10 +3979,12 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)

 	len = sprintf(buf, "%lu", sum);

+#ifdef CONFIG_SMP
 	for_each_online_cpu(cpu) {
 		if (data[cpu] && len < PAGE_SIZE - 20)
-			len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
 	}
+#endif
 	kfree(data);
 	return len + sprintf(buf + len, "\n");
 }
--
cgit v1.2.3-59-g8ed1b

From 49bd5221ce8fb55d12c04a3ffd375201c5bbfb7a Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:52:18 +0300
Subject: slub: Move map/flag clearing to __free_slab

__free_slab does some diagnostics. The resetting of mapcount etc. in
discard_slab() can interfere with debug processing. So move the reset
immediately before the page is freed.
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 23e5ee7b149f..f924cffb29e7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1125,6 +1125,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 		-pages);

+	__ClearPageSlab(page);
+	reset_page_mapcount(page);
 	__free_pages(page, s->order);
 }

@@ -1154,8 +1156,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 	struct kmem_cache_node *n = get_node(s, page_to_nid(page));

 	atomic_long_dec(&n->nr_slabs);
-	reset_page_mapcount(page);
-	__ClearPageSlab(page);
 	free_slab(s, page);
 }
--
cgit v1.2.3-59-g8ed1b

From 0f389ec63077521166f071e1e970aed36147fd45 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:53:02 +0300
Subject: slub: No need for per node slab counters if !SLUB_DEBUG

The per node counters are used mainly for showing data through the sysfs
API. If that API is not compiled in then there is no point in keeping
track of this data. Disable counters for the number of slabs and the
number of total slabs if !SLUB_DEBUG.

Incrementing the per node counters is also accessing a potentially
contended cacheline, so this could actually be a performance benefit to
embedded systems.

SLABINFO support is also affected. It now must depend on SLUB_DEBUG
(which is on by default).

The patch also avoids a check for a NULL kmem_cache_node pointer in
new_slab() if the system is not compiled with NUMA support.

[penberg@cs.helsinki.fi: fix oops and move ->nr_slabs into CONFIG_SLUB_DEBUG]
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 include/linux/slub_def.h |  2 +-
 init/Kconfig             |  2 +-
 mm/slub.c                | 51 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index b00c1c73eb0a..79d59c937fac 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -45,9 +45,9 @@ struct kmem_cache_cpu {
 struct kmem_cache_node {
 	spinlock_t list_lock;	/* Protect partial list and nr_partial */
 	unsigned long nr_partial;
-	atomic_long_t nr_slabs;
 	struct list_head partial;
 #ifdef CONFIG_SLUB_DEBUG
+	atomic_long_t nr_slabs;
 	struct list_head full;
 #endif
 };

diff --git a/init/Kconfig b/init/Kconfig
index a97924bc5b8d..7fccf09bb95a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -763,7 +763,7 @@ endmenu		# General setup
 config SLABINFO
 	bool
 	depends on PROC_FS
-	depends on SLAB || SLUB
+	depends on SLAB || SLUB_DEBUG
 	default y

 config RT_MUTEXES

diff --git a/mm/slub.c b/mm/slub.c
index f924cffb29e7..7f8aaa291a4e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -837,6 +837,35 @@ static void remove_full(struct kmem_cache *s, struct page *page)
 	spin_unlock(&n->list_lock);
 }

+/* Tracking of the number of slabs for debugging purposes */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	return atomic_long_read(&n->nr_slabs);
+}
+
+static inline void inc_slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	/*
+	 * May be called early in order to allocate a slab for the
+	 * kmem_cache_node structure. Solve the chicken-egg
+	 * dilemma by deferring the increment of the count during
+	 * bootstrap (see early_kmem_cache_node_alloc).
+	 */
+	if (!NUMA_BUILD || n)
+		atomic_long_inc(&n->nr_slabs);
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	atomic_long_dec(&n->nr_slabs);
+}
+
+/* Object debug checks for alloc/free paths */
 static void setup_object_debug(struct kmem_cache *s, struct page *page,
 								void *object)
 {
@@ -1028,6 +1057,11 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
 	return flags;
 }
 #define slub_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+							{ return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node) {}
 #endif

 /*
  * Slab allocation and freeing
@@ -1066,7 +1100,6 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct page *page;
-	struct kmem_cache_node *n;
 	void *start;
 	void *last;
 	void *p;
@@ -1078,9 +1111,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	if (!page)
 		goto out;

-	n = get_node(s, page_to_nid(page));
-	if (n)
-		atomic_long_inc(&n->nr_slabs);
+	inc_slabs_node(s, page_to_nid(page));
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
 	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1153,9 +1184,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)

 static void discard_slab(struct kmem_cache *s, struct page *page)
 {
-	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
-	atomic_long_dec(&n->nr_slabs);
+	dec_slabs_node(s, page_to_nid(page));
 	free_slab(s, page);
 }
@@ -1894,10 +1923,10 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
 static void init_kmem_cache_node(struct kmem_cache_node *n)
 {
 	n->nr_partial = 0;
-	atomic_long_set(&n->nr_slabs, 0);
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
 #ifdef CONFIG_SLUB_DEBUG
+	atomic_long_set(&n->nr_slabs, 0);
 	INIT_LIST_HEAD(&n->full);
 #endif
 }
@@ -2066,7 +2095,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
 	init_tracking(kmalloc_caches, n);
 #endif
 	init_kmem_cache_node(n);
-	atomic_long_inc(&n->nr_slabs);
+	inc_slabs_node(kmalloc_caches, node);

 	/*
 	 * lockdep requires consistent irq usage for each lock
@@ -2379,7 +2408,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 		struct kmem_cache_node *n = get_node(s, node);

 		n->nr_partial -= free_list(s, n, &n->partial);
-		if (atomic_long_read(&n->nr_slabs))
+		if (slabs_node(s, node))
 			return 1;
 	}
 	free_kmem_cache_nodes(s);
@@ -2801,7 +2830,7 @@ static void slab_mem_offline_callback(void *arg)
 			 * and offline_pages() function shoudn't call this
 			 * callback. So, we must fail.
 			 */
-			BUG_ON(atomic_long_read(&n->nr_slabs));
+			BUG_ON(slabs_node(s, offline_node));

 			s->node[offline_node] = NULL;
 			kmem_cache_free(kmalloc_caches, n);
--
cgit v1.2.3-59-g8ed1b

From bead9a3abd15710b0bdfd418daef606722d86282 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 16 Apr 2008 01:40:00 +0200
Subject: mm: sparsemem memory_present() fix

Fix memory corruption and crash on 32-bit x86 systems.

If a !PAE x86 kernel is booted on a 32-bit system with more than 4GB of
RAM, then we call memory_present() with a start/end that goes outside
the scope of MAX_PHYSMEM_BITS.
That causes this loop to happily walk over the limit of the sparse memory
section map:

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map)
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_MARKED_PRESENT;

'ms' will be out of bounds and we'll corrupt a small amount of memory by
encoding the node ID and writing SECTION_MARKED_PRESENT (==0x1) over it.

The corruption might happen when encoding a non-zero node ID, or due to
the SECTION_MARKED_PRESENT which is 0x1:

	mmzone.h:#define SECTION_MARKED_PRESENT	(1UL<<0)

The fix is to sanity check anything the architecture passes to sparsemem.

This bug seems to be rather old (as old as sparsemem support itself), but
the exact incarnation depended on random details like configs, which made
this bug more prominent in v2.6.25-to-be.

An additional enhancement might be to print a warning about ignored or
trimmed memory ranges.

Signed-off-by: Ingo Molnar
Tested-by: Christoph Lameter
Cc: Pekka Enberg
Cc: Mel Gorman
Cc: Nick Piggin
Cc: Andrew Morton
Cc: Rafael J. Wysocki
Cc: Yinghai Lu
Cc: KAMEZAWA Hiroyuki
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index f6a43c09c322..98d6b39c3472 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -149,8 +149,18 @@ static inline int sparse_early_nid(struct mem_section *section)
 /* Record a memory area against a node. */
 void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
+	unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
 	unsigned long pfn;

+	/*
+	 * Sanity checks - do not allow an architecture to pass
+	 * in larger pfns than the maximum scope of sparsemem:
+	 */
+	if (start >= max_arch_pfn)
+		return;
+	if (end >= max_arch_pfn)
+		end = max_arch_pfn;
+
 	start &= PAGE_SECTION_MASK;
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
 		unsigned long section = pfn_to_section_nr(pfn);
--
cgit v1.2.3-59-g8ed1b

From e115f2d89253490fb2dbf304b627f8d908df26f1 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Tue, 15 Apr 2008 14:34:37 -0700
Subject: memcg: fix oops in oom handling

When I used a test program to fork a mass of processes and immediately
move them to a cgroup whose memory limit is low enough to trigger the oom
killer, I got this oops:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000808
IP: [] _spin_lock_irqsave+0x8/0x18
PGD 4c95f067 PUD 4406c067 PMD 0
Oops: 0002 [1] SMP
CPU 2
Modules linked in:
Pid: 11973, comm: a.out Not tainted 2.6.25-rc7 #5
RIP: 0010:[] [] _spin_lock_irqsave+0x8/0x18
RSP: 0018:ffff8100448c7c30 EFLAGS: 00010002
RAX: 0000000000000202 RBX: 0000000000000009 RCX: 000000000001c9f3
RDX: 0000000000000100 RSI: 0000000000000001 RDI: 0000000000000808
RBP: ffff81007e444080 R08: 0000000000000000 R09: ffff8100448c7900
R10: ffff81000105f480 R11: 00000100ffffffff R12: ffff810067c84140
R13: 0000000000000001 R14: ffff8100441d0018 R15: ffff81007da56200
FS: 00007f70eb1856f0(0000) GS:ffff81007fbad3c0(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000808 CR3: 000000004498a000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process a.out (pid: 11973, threadinfo ffff8100448c6000, task ffff81007da533e0)
Stack:
 ffffffff8023ef5a 00000000000000d0 ffffffff80548dc0 00000000000000d0
 ffff810067c84140 ffff81007e444080 ffffffff8026cef9 00000000000000d0
 ffff8100441d0000 00000000000000d0 ffff8100441d0000 ffff8100505445c0
Call Trace:
 [] ? force_sig_info+0x25/0xb9
 [] ? oom_kill_task+0x77/0xe2
 [] ? mem_cgroup_out_of_memory+0x55/0x67
 [] ? mem_cgroup_charge_common+0xec/0x202
 [] ? handle_mm_fault+0x24e/0x77f
 [] ? default_wake_function+0x0/0xe
 [] ? get_user_pages+0x2ce/0x3af
 [] ? mem_cgroup_charge_common+0x2d/0x202
 [] ? make_pages_present+0x8e/0xa4
 [] ? mmap_region+0x373/0x429
 [] ? do_mmap_pgoff+0x2ff/0x364
 [] ? sys_mmap+0xe5/0x111
 [] ? tracesys+0xdc/0xe1
Code: 00 00 01 48 8b 3c 24 e9 46 d4 dd ff f0 ff 07 48 8b 3c 24 e9 3a d4 dd ff fe 07 48 8b 3c 24 e9 2f d4 dd ff 9c 58 fa ba 00 01 00 00 66 0f c1 17 38 f2 74 06 f3 90 8a 17 eb f6 c3 fa b8 00 01 00
RIP [] _spin_lock_irqsave+0x8/0x18
 RSP
CR2: 0000000000000808
---[ end trace c3702fa668021ea4 ]---

It's reproducible on an x86_64 box, but doesn't happen on x86_32.

This is because tsk->sighand is not guarded by RCU, so we have to hold
tasklist_lock, just as out_of_memory() does.

Signed-off-by: Li Zefan
Cc: KAMEZAWA Hiroyuki
Acked-by: Balbir Singh
Cc: Pavel Emelianov
Cc: Paul Menage
Cc: Oleg Nesterov
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f255eda693b0..beb592fe9389 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -423,7 +423,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 	struct task_struct *p;

 	cgroup_lock();
-	rcu_read_lock();
+	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process(&points, mem);
 	if (PTR_ERR(p) == -1UL)
@@ -436,7 +436,7 @@ retry:
 				"Memory cgroup out of memory"))
 		goto retry;
 out:
-	rcu_read_unlock();
+	read_unlock(&tasklist_lock);
 	cgroup_unlock();
 }
 #endif
--
cgit v1.2.3-59-g8ed1b

From 91446b064c748fc2a238fd68b677c9671e536bfd Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Tue, 15 Apr 2008 14:34:42 -0700
Subject: add "Isolate" migratetype name to /proc/pagetypeinfo

In commit a5d76b54a3f3a40385d7f76069a2feac9f1bad63 ("memory unplug: page
isolation" by KAMEZAWA Hiroyuki), the "isolate" migratetype was added,
but unfortunately the /proc/pagetypeinfo display logic was not updated to
handle it. This patch adds "Isolate" to the pagetype name field.
/proc/pagetypeinfo before:
------------------------------------------------------------------------------------------------------------------------
Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
Node    0, zone      DMA, type    Unmovable      1      2      2      2      1      2      2      1      1      0      0
Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone      DMA, type      Movable      2      3      3      1      3      3      2      0      0      0      0
Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone      DMA, type                   0      0      0      0      0      0      0      0      0      0      0
Node    0, zone   Normal, type    Unmovable      1      9      7      4      1      1      1      1      0      0      0
Node    0, zone   Normal, type  Reclaimable      5      2      0      0      1      1      0      0      0      1      0
Node    0, zone   Normal, type      Movable      0      1      1      0      0      0      1      0      0      1     60
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone   Normal, type                   0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type    Unmovable      0      0      1      1      1      0      1      1      2      2      0
Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type      Movable    236     62      6      2      2      1      1      0      1      1     16
Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone  HighMem, type                   0      0      0      0      0      0      0      0      0      0      0

Number of blocks type     Unmovable  Reclaimable      Movable      Reserve
Node 0, zone      DMA            1            0            2            1            0
Node 0, zone   Normal           10           40          169            1            0
Node 0, zone  HighMem            2            0          283            1            0

after:
------------------------------------------------------------------------------------------------------------------------
Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
Node    0, zone      DMA, type    Unmovable      1      2      2      2      1      2      2      1      1      0      0
Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone      DMA, type      Movable      2      3      3      1      3      3      2      0      0      0      0
Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone   Normal, type    Unmovable      0      2      1      1      0      1      0      0      0      0      0
Node    0, zone   Normal, type  Reclaimable      1      1      1      1      1      0      1      1      1      0      0
Node    0, zone   Normal, type      Movable      0      1      1      1      0      1      0      1      0      0    196
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone   Normal, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type    Unmovable      0      1      0      0      0      1      1      1      2      2      0
Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type      Movable      1      0      1      1      0      0      0      0      1      0    200
Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone  HighMem, type      Isolate      0      0      0      0      0      0      0      0      0      0      0

Number of blocks type     Unmovable  Reclaimable      Movable      Reserve      Isolate
Node 0, zone      DMA            1            0            2            1            0
Node 0, zone   Normal            8            4          207            1            0
Node 0, zone  HighMem            2            0          283            1            0

Signed-off-by: KOSAKI Motohiro
Acked-by: KAMEZAWA Hiroyuki
Acked-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmstat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 422d960ffcd8..7c7286e9506d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -388,6 +388,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
 	"Reclaimable",
 	"Movable",
 	"Reserve",
+	"Isolate",
 };

 static void *frag_start(struct seq_file *m, loff_t *pos)
--
cgit v1.2.3-59-g8ed1b

From c33fa9f5609e918824446ef9a75319d4a802f1f4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 17 Apr 2008 20:05:36 +0200
Subject: uaccess: add probe_kernel_write()

Add probe_kernel_read() and probe_kernel_write(). They are uninlined and
restricted to kernel-range memory only, as suggested by Linus.
Signed-off-by: Ingo Molnar
Reviewed-by: Thomas Gleixner
---
 include/linux/uaccess.h | 22 ++++++++++++++++++++++
 mm/Makefile             |  2 +-
 mm/maccess.c            | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 mm/maccess.c

(limited to 'mm')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 975c963e5789..fec6decfb983 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -84,4 +84,26 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 		ret;					\
 	})

+/*
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long probe_kernel_read(void *dst, void *src, size_t size);
+
+/*
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long probe_kernel_write(void *dst, void *src, size_t size);
+
 #endif		/* __LINUX_UACCESS_H__ */

diff --git a/mm/Makefile b/mm/Makefile
index a5b0dd93427a..18c143b3c46c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   vmalloc.o

 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   page_alloc.o page-writeback.o pdflush.o \
+			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o $(mmu-y)

diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 000000000000..24f81b971403
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,49 @@
+/*
+ * Access kernel memory without faulting.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+/**
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_read(void *dst, void *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst,
+			(__force const void __user *)src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+/**
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_write(void *dst, void *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_write);
--
cgit v1.2.3-59-g8ed1b

From b4b8ac524d9b6ed7229017145afa1d7afbea4a48 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Wed, 20 Feb 2008 13:33:38 -0600
Subject: kgdb: fix optional arch functions and probe_kernel_*

Fix two regressions dealing with the kgdb core.
1) kgdb_skipexception and kgdb_post_primary_code are optional functions
   that are only required on archs that need special exception fixups.

2) The kernel address space scope must be set on any probe_kernel_*
   function or archs such as ARCH=arm will not allow access to the kernel
   memory space. As an example, it is required to allow the full kernel
   address space when you use the kernel debugger to inspect a system
   call.

Signed-off-by: Jason Wessel
Signed-off-by: Ingo Molnar
---
 kernel/kgdb.c | 11 +++++++++++
 mm/maccess.c  |  6 ++++++
 2 files changed, 17 insertions(+)

(limited to 'mm')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 68aea78407e4..31425e0fbf20 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -200,6 +200,17 @@ int __weak kgdb_arch_init(void)
 	return 0;
 }

+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+	return 0;
+}
+
+void __weak
+kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
+{
+	return;
+}
+
 /**
  *	kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
  *	@regs: Current &struct pt_regs.

diff --git a/mm/maccess.c b/mm/maccess.c
index 24f81b971403..ac40796cfb15 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -17,11 +17,14 @@ long probe_kernel_read(void *dst, void *src, size_t size)
 {
 	long ret;
+	mm_segment_t old_fs = get_fs();

+	set_fs(KERNEL_DS);
 	pagefault_disable();
 	ret = __copy_from_user_inatomic(dst,
 			(__force const void __user *)src, size);
 	pagefault_enable();
+	set_fs(old_fs);

 	return ret ? -EFAULT : 0;
 }
@@ -39,10 +42,13 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
 long probe_kernel_write(void *dst, void *src, size_t size)
 {
 	long ret;
+	mm_segment_t old_fs = get_fs();

+	set_fs(KERNEL_DS);
 	pagefault_disable();
 	ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
 	pagefault_enable();
+	set_fs(old_fs);

 	return ret ? -EFAULT : 0;
 }
--
cgit v1.2.3-59-g8ed1b
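For illustration, here is a minimal sketch of how a caller such as a
debugger might use the probe_kernel_* helpers added above. This is a
hypothetical example, not part of any patch in this series: the helper
name peek_kernel_word() is invented for illustration, while
probe_kernel_read() and its return convention (0 on success, -EFAULT on
fault) come from the commits above.

	#include <linux/kernel.h>
	#include <linux/uaccess.h>

	/*
	 * Hypothetical helper: read one word from an arbitrary, possibly
	 * unmapped kernel address without risking an unhandled fault.
	 */
	static int peek_kernel_word(unsigned long addr, unsigned long *val)
	{
		/*
		 * probe_kernel_read() disables pagefaults internally and
		 * returns -EFAULT instead of oopsing on a bad address.
		 */
		if (probe_kernel_read(val, (void *)addr, sizeof(*val)))
			return -EFAULT;
		return 0;
	}

A debugger front-end could loop this helper over a memory range to dump
kernel memory safely, which is exactly the kind of use the kgdb fix above
depends on.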