From 62f75532b583c03840f31e40386ce2df73be9ca0 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Mon, 14 Apr 2008 18:50:44 +0300
Subject: slub: Initialize per-cpu stats

As spotted by kmemcheck, we need to initialize the per-CPU ->stat array
before using it.

[kmem_cache_cpu structures are usually allocated from arrays defined via
DEFINE_PER_CPU that are zeroed so we have not noticed this so far --cl].

Reported-by: Vegard Nossum
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index acc975fcc8cc..15a7a0d45d71 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1886,6 +1886,9 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
 	c->node = 0;
 	c->offset = s->offset / sizeof(void *);
 	c->objsize = s->objsize;
+#ifdef CONFIG_SLUB_STATS
+	memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
+#endif
 }

 static void init_kmem_cache_node(struct kmem_cache_node *n)
--
cgit v1.2.3-59-g8ed1b

From 4097d6017576a5e138f442f5e3c393ad00d10f58 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:51:18 +0300
Subject: slub: Reduce #ifdef ZONE_DMA by moving kmalloc_caches_dma near dma logic

Move the definition of kmalloc_caches_dma() into a later #ifdef
CONFIG_ZONE_DMA. This saves one #ifdef and leaves us with a total of two
#ifdefs for dma slab support.

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 15a7a0d45d71..3df6d5bdd711 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2412,10 +2412,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);

-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
-#endif
-
 static int __init setup_slub_min_order(char *str)
 {
 	get_option(&str, &slub_min_order);
@@ -2475,6 +2471,7 @@ panic:
 }

 #ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];

 static void sysfs_add_func(struct work_struct *w)
 {
--
cgit v1.2.3-59-g8ed1b

From 5b06c853ad447636e31d105e95c48ae9abb6bfb5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:51:34 +0300
Subject: slub: Deal with config variable dependencies

count_partial() is used by both slabinfo and the sysfs proc support. Move
the function directly before the beginning of the sysfs code so that it
can be easily found. Rework the preprocessor conditional to take into
account that slub sysfs support depends on CONFIG_SYSFS *and*
CONFIG_SLUB_DEBUG.

Make CONFIG_SLUB_STATS depend on CONFIG_SLUB_DEBUG and CONFIG_SYSFS.
There is no point in keeping statistics if no one can retrieve them.

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 lib/Kconfig.debug |  2 +-
 mm/slub.c         | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0796c1a090c0..eef557dc46c3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -211,7 +211,7 @@ config SLUB_DEBUG_ON
 config SLUB_STATS
 	default n
 	bool "Enable SLUB performance statistics"
-	depends on SLUB
+	depends on SLUB && SLUB_DEBUG && SYSFS
 	help
 	  SLUB statistics are useful to debug SLUBs allocation behavior
 	  in order find ways to optimize the allocator.
	  This should never be

diff --git a/mm/slub.c b/mm/slub.c
index 3df6d5bdd711..3fcdcf7d77ba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2688,21 +2688,6 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);

-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO)
-static unsigned long count_partial(struct kmem_cache_node *n)
-{
-	unsigned long flags;
-	unsigned long x = 0;
-	struct page *page;
-
-	spin_lock_irqsave(&n->list_lock, flags);
-	list_for_each_entry(page, &n->partial, lru)
-		x += page->inuse;
-	spin_unlock_irqrestore(&n->list_lock, flags);
-	return x;
-}
-#endif
-
 /*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
@@ -3181,6 +3166,21 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	return slab_alloc(s, gfpflags, node, caller);
 }

+#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
+static unsigned long count_partial(struct kmem_cache_node *n)
+{
+	unsigned long flags;
+	unsigned long x = 0;
+	struct page *page;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry(page, &n->partial, lru)
+		x += page->inuse;
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return x;
+}
+#endif
+
 #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
 static int validate_slab(struct kmem_cache *s, struct page *page,
 						unsigned long *map)
--
cgit v1.2.3-59-g8ed1b

From 50ef37b96c11e76625067ae413dc54585ea22585 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:52:05 +0300
Subject: slub: Fixes to per cpu stat output in sysfs

Only output per cpu stats if the kernel is built for SMP.

Use a capital "C" as a leading character for the processor number (same as
the numa statistics that also use a capital letter "N").

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 3fcdcf7d77ba..23e5ee7b149f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3979,10 +3979,12 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)

 	len = sprintf(buf, "%lu", sum);

+#ifdef CONFIG_SMP
 	for_each_online_cpu(cpu) {
 		if (data[cpu] && len < PAGE_SIZE - 20)
-			len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
 	}
+#endif
 	kfree(data);
 	return len + sprintf(buf + len, "\n");
 }
--
cgit v1.2.3-59-g8ed1b

From 49bd5221ce8fb55d12c04a3ffd375201c5bbfb7a Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:52:18 +0300
Subject: slub: Move map/flag clearing to __free_slab

__free_slab does some diagnostics. The resetting of mapcount etc. in
discard_slab() can interfere with debug processing. So move the reset
immediately before the page is freed.
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 23e5ee7b149f..f924cffb29e7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1125,6 +1125,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 		-pages);

+	__ClearPageSlab(page);
+	reset_page_mapcount(page);
 	__free_pages(page, s->order);
 }

@@ -1154,8 +1156,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 	struct kmem_cache_node *n = get_node(s, page_to_nid(page));

 	atomic_long_dec(&n->nr_slabs);
-	reset_page_mapcount(page);
-	__ClearPageSlab(page);
 	free_slab(s, page);
 }
--
cgit v1.2.3-59-g8ed1b

From 0f389ec63077521166f071e1e970aed36147fd45 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 14 Apr 2008 18:53:02 +0300
Subject: slub: No need for per node slab counters if !SLUB_DEBUG

The per node counters are used mainly for showing data through the sysfs
API. If that API is not compiled in then there is no point in keeping
track of this data. Disable counters for the number of slabs and the
number of total slabs if !SLUB_DEBUG.

Incrementing the per node counters is also accessing a potentially
contended cacheline, so this could actually be a performance benefit to
embedded systems.

SLABINFO support is also affected. It now must depend on SLUB_DEBUG
(which is on by default).

The patch also avoids a check for a NULL kmem_cache_node pointer in
new_slab() if the system is not compiled with NUMA support.

[penberg@cs.helsinki.fi: fix oops and move ->nr_slabs into CONFIG_SLUB_DEBUG]
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 include/linux/slub_def.h |  2 +-
 init/Kconfig             |  2 +-
 mm/slub.c                | 51 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index b00c1c73eb0a..79d59c937fac 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -45,9 +45,9 @@ struct kmem_cache_cpu {
 struct kmem_cache_node {
 	spinlock_t list_lock;	/* Protect partial list and nr_partial */
 	unsigned long nr_partial;
-	atomic_long_t nr_slabs;
 	struct list_head partial;
 #ifdef CONFIG_SLUB_DEBUG
+	atomic_long_t nr_slabs;
 	struct list_head full;
 #endif
 };

diff --git a/init/Kconfig b/init/Kconfig
index a97924bc5b8d..7fccf09bb95a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -763,7 +763,7 @@ endmenu		# General setup
 config SLABINFO
 	bool
 	depends on PROC_FS
-	depends on SLAB || SLUB
+	depends on SLAB || SLUB_DEBUG
 	default y

 config RT_MUTEXES

diff --git a/mm/slub.c b/mm/slub.c
index f924cffb29e7..7f8aaa291a4e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -837,6 +837,35 @@ static void remove_full(struct kmem_cache *s, struct page *page)
 	spin_unlock(&n->list_lock);
 }

+/* Tracking of the number of slabs for debugging purposes */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	return atomic_long_read(&n->nr_slabs);
+}
+
+static inline void inc_slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	/*
+	 * May be called early in order to allocate a slab for the
+	 * kmem_cache_node structure. Solve the chicken-egg
+	 * dilemma by deferring the increment of the count during
+	 * bootstrap (see early_kmem_cache_node_alloc).
+	 */
+	if (!NUMA_BUILD || n)
+		atomic_long_inc(&n->nr_slabs);
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	atomic_long_dec(&n->nr_slabs);
+}
+
+/* Object debug checks for alloc/free paths */
 static void setup_object_debug(struct kmem_cache *s, struct page *page,
 								void *object)
 {
@@ -1028,6 +1057,11 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
 	return flags;
 }
 #define slub_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+							{ return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node) {}
 #endif

 /*
  * Slab allocation and freeing
@@ -1066,7 +1100,6 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct page *page;
-	struct kmem_cache_node *n;
 	void *start;
 	void *last;
 	void *p;
@@ -1078,9 +1111,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	if (!page)
 		goto out;

-	n = get_node(s, page_to_nid(page));
-	if (n)
-		atomic_long_inc(&n->nr_slabs);
+	inc_slabs_node(s, page_to_nid(page));
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
 	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1153,9 +1184,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)

 static void discard_slab(struct kmem_cache *s, struct page *page)
 {
-	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
-	atomic_long_dec(&n->nr_slabs);
+	dec_slabs_node(s, page_to_nid(page));
 	free_slab(s, page);
 }
@@ -1894,10 +1923,10 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
 static void init_kmem_cache_node(struct kmem_cache_node *n)
 {
 	n->nr_partial = 0;
-	atomic_long_set(&n->nr_slabs, 0);
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
 #ifdef CONFIG_SLUB_DEBUG
+	atomic_long_set(&n->nr_slabs, 0);
 	INIT_LIST_HEAD(&n->full);
 #endif
 }
@@ -2066,7 +2095,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
 	init_tracking(kmalloc_caches, n);
 #endif
 	init_kmem_cache_node(n);
-	atomic_long_inc(&n->nr_slabs);
+	inc_slabs_node(kmalloc_caches, node);

 	/*
 	 * lockdep requires consistent irq usage for each lock
@@ -2379,7 +2408,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 		struct kmem_cache_node *n = get_node(s, node);

 		n->nr_partial -= free_list(s, n, &n->partial);
-		if (atomic_long_read(&n->nr_slabs))
+		if (slabs_node(s, node))
 			return 1;
 	}
 	free_kmem_cache_nodes(s);
@@ -2801,7 +2830,7 @@ static void slab_mem_offline_callback(void *arg)
 			 * and offline_pages() function shoudn't call this
 			 * callback. So, we must fail.
 			 */
-			BUG_ON(atomic_long_read(&n->nr_slabs));
+			BUG_ON(slabs_node(s, offline_node));

 			s->node[offline_node] = NULL;
 			kmem_cache_free(kmalloc_caches, n);
--
cgit v1.2.3-59-g8ed1b

From bead9a3abd15710b0bdfd418daef606722d86282 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 16 Apr 2008 01:40:00 +0200
Subject: mm: sparsemem memory_present() fix

Fix memory corruption and crash on 32-bit x86 systems.

If a !PAE x86 kernel is booted on a 32-bit system with more than 4GB of
RAM, then we call memory_present() with a start/end that goes outside
the scope of MAX_PHYSMEM_BITS.
That causes this loop to happily walk over the limit of the sparse memory
section map:

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map)
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_MARKED_PRESENT;

'ms' will be out of bounds and we'll corrupt a small amount of memory by
encoding the node ID and writing SECTION_MARKED_PRESENT (==0x1) over it.

The corruption might happen when encoding a non-zero node ID, or due to
the SECTION_MARKED_PRESENT which is 0x1:

	mmzone.h:#define SECTION_MARKED_PRESENT	(1UL<<0)

The fix is to sanity check anything the architecture passes to sparsemem.

This bug seems to be rather old (as old as sparsemem support itself), but
the exact incarnation depended on random details like configs, which made
this bug more prominent in v2.6.25-to-be.

An additional enhancement might be to print a warning about ignored or
trimmed memory ranges.

Signed-off-by: Ingo Molnar
Tested-by: Christoph Lameter
Cc: Pekka Enberg
Cc: Mel Gorman
Cc: Nick Piggin
Cc: Andrew Morton
Cc: Rafael J. Wysocki
Cc: Yinghai Lu
Cc: KAMEZAWA Hiroyuki
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index f6a43c09c322..98d6b39c3472 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -149,8 +149,18 @@ static inline int sparse_early_nid(struct mem_section *section)
 /* Record a memory area against a node. */
 void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
+	unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
 	unsigned long pfn;

+	/*
+	 * Sanity checks - do not allow an architecture to pass
+	 * in larger pfns than the maximum scope of sparsemem:
+	 */
+	if (start >= max_arch_pfn)
+		return;
+	if (end >= max_arch_pfn)
+		end = max_arch_pfn;
+
 	start &= PAGE_SECTION_MASK;
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
 		unsigned long section = pfn_to_section_nr(pfn);
--
cgit v1.2.3-59-g8ed1b

From e115f2d89253490fb2dbf304b627f8d908df26f1 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Tue, 15 Apr 2008 14:34:37 -0700
Subject: memcg: fix oops in oom handling

When I used a test program to fork a mass of processes and immediately
move them to a cgroup whose memory limit is low enough to trigger the oom
killer, I got this oops:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000808
IP: [] _spin_lock_irqsave+0x8/0x18
PGD 4c95f067 PUD 4406c067 PMD 0
Oops: 0002 [1] SMP
CPU 2
Modules linked in:
Pid: 11973, comm: a.out Not tainted 2.6.25-rc7 #5
RIP: 0010:[] [] _spin_lock_irqsave+0x8/0x18
RSP: 0018:ffff8100448c7c30 EFLAGS: 00010002
RAX: 0000000000000202 RBX: 0000000000000009 RCX: 000000000001c9f3
RDX: 0000000000000100 RSI: 0000000000000001 RDI: 0000000000000808
RBP: ffff81007e444080 R08: 0000000000000000 R09: ffff8100448c7900
R10: ffff81000105f480 R11: 00000100ffffffff R12: ffff810067c84140
R13: 0000000000000001 R14: ffff8100441d0018 R15: ffff81007da56200
FS: 00007f70eb1856f0(0000) GS:ffff81007fbad3c0(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000808 CR3: 000000004498a000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process a.out (pid: 11973, threadinfo ffff8100448c6000, task ffff81007da533e0)
Stack:
 ffffffff8023ef5a 00000000000000d0 ffffffff80548dc0 00000000000000d0
 ffff810067c84140 ffff81007e444080 ffffffff8026cef9 00000000000000d0
 ffff8100441d0000 00000000000000d0 ffff8100441d0000 ffff8100505445c0
Call Trace:
 [] ? force_sig_info+0x25/0xb9
 [] ? oom_kill_task+0x77/0xe2
 [] ? mem_cgroup_out_of_memory+0x55/0x67
 [] ? mem_cgroup_charge_common+0xec/0x202
 [] ? handle_mm_fault+0x24e/0x77f
 [] ? default_wake_function+0x0/0xe
 [] ? get_user_pages+0x2ce/0x3af
 [] ? mem_cgroup_charge_common+0x2d/0x202
 [] ? make_pages_present+0x8e/0xa4
 [] ? mmap_region+0x373/0x429
 [] ? do_mmap_pgoff+0x2ff/0x364
 [] ? sys_mmap+0xe5/0x111
 [] ? tracesys+0xdc/0xe1
Code: 00 00 01 48 8b 3c 24 e9 46 d4 dd ff f0 ff 07 48 8b 3c 24 e9 3a d4 dd ff fe 07 48 8b 3c 24 e9 2f d4 dd ff 9c 58 fa ba 00 01 00 00 66 0f c1 17 38 f2 74 06 f3 90 8a 17 eb f6 c3 fa b8 00 01 00
RIP [] _spin_lock_irqsave+0x8/0x18
 RSP
CR2: 0000000000000808
---[ end trace c3702fa668021ea4 ]---

It's reproducible on an x86_64 box, but doesn't happen on x86_32.

This is because tsk->sighand is not guarded by RCU, so we have to hold
tasklist_lock, just as out_of_memory() does.

Signed-off-by: Li Zefan
Cc: KAMEZAWA Hiroyuki
Acked-by: Balbir Singh
Cc: Pavel Emelianov
Cc: Paul Menage
Cc: Oleg Nesterov
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f255eda693b0..beb592fe9389 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -423,7 +423,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 	struct task_struct *p;

 	cgroup_lock();
-	rcu_read_lock();
+	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process(&points, mem);
 	if (PTR_ERR(p) == -1UL)
@@ -436,7 +436,7 @@ retry:
 				"Memory cgroup out of memory"))
 		goto retry;
 out:
-	rcu_read_unlock();
+	read_unlock(&tasklist_lock);
 	cgroup_unlock();
 }
 #endif
--
cgit v1.2.3-59-g8ed1b

From 91446b064c748fc2a238fd68b677c9671e536bfd Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Tue, 15 Apr 2008 14:34:42 -0700
Subject: add "Isolate" migratetype name to /proc/pagetypeinfo

In commit a5d76b54a3f3a40385d7f76069a2feac9f1bad63 ("memory unplug: page
isolation" by KAMEZAWA Hiroyuki), the "isolate" migratetype was added,
but unfortunately the /proc/pagetypeinfo display logic was not updated to
handle it. This patch adds "Isolate" to the pagetype name field.
/proc/pagetypeinfo before:
------------------------------------------------------------------------------------------------------------------------
Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
Node    0, zone      DMA, type    Unmovable      1      2      2      2      1      2      2      1      1      0      0
Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone      DMA, type      Movable      2      3      3      1      3      3      2      0      0      0      0
Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone      DMA, type                   0      0      0      0      0      0      0      0      0      0      0
Node    0, zone   Normal, type    Unmovable      1      9      7      4      1      1      1      1      0      0      0
Node    0, zone   Normal, type  Reclaimable      5      2      0      0      1      1      0      0      0      1      0
Node    0, zone   Normal, type      Movable      0      1      1      0      0      0      1      0      0      1     60
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone   Normal, type                   0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type    Unmovable      0      0      1      1      1      0      1      1      2      2      0
Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type      Movable    236     62      6      2      2      1      1      0      1      1     16
Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone  HighMem, type                   0      0      0      0      0      0      0      0      0      0      0

Number of blocks type     Unmovable  Reclaimable      Movable      Reserve
Node 0, zone      DMA            1            0            2            1            0
Node 0, zone   Normal           10           40          169            1            0
Node 0, zone  HighMem            2            0          283            1            0

after:
------------------------------------------------------------------------------------------------------------------------
Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
Node    0, zone      DMA, type    Unmovable      1      2      2      2      1      2      2      1      1      0      0
Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone      DMA, type      Movable      2      3      3      1      3      3      2      0      0      0      0
Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone   Normal, type    Unmovable      0      2      1      1      0      1      0      0      0      0      0
Node    0, zone   Normal, type  Reclaimable      1      1      1      1      1      0      1      1      1      0      0
Node    0, zone   Normal, type      Movable      0      1      1      1      0      1      0      1      0      0    196
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone   Normal, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type    Unmovable      0      1      0      0      0      1      1      1      2      2      0
Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type      Movable      1      0      1      1      0      0      0      0      1      0    200
Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone  HighMem, type      Isolate      0      0      0      0      0      0      0      0      0      0      0

Number of blocks type     Unmovable  Reclaimable      Movable      Reserve      Isolate
Node 0, zone      DMA            1            0            2            1            0
Node 0, zone   Normal            8            4          207            1            0
Node 0, zone  HighMem            2            0          283            1            0

Signed-off-by: KOSAKI Motohiro
Acked-by: KAMEZAWA Hiroyuki
Acked-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmstat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 422d960ffcd8..7c7286e9506d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -388,6 +388,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
 	"Reclaimable",
 	"Movable",
 	"Reserve",
+	"Isolate",
 };

 static void *frag_start(struct seq_file *m, loff_t *pos)
--
cgit v1.2.3-59-g8ed1b

From c33fa9f5609e918824446ef9a75319d4a802f1f4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 17 Apr 2008 20:05:36 +0200
Subject: uaccess: add probe_kernel_write()

Add probe_kernel_read() and probe_kernel_write(). They are uninlined and
restricted to kernel-range memory only, as suggested by Linus.
Signed-off-by: Ingo Molnar
Reviewed-by: Thomas Gleixner
---
 include/linux/uaccess.h | 22 ++++++++++++++++++++++
 mm/Makefile             |  2 +-
 mm/maccess.c            | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 mm/maccess.c

(limited to 'mm')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 975c963e5789..fec6decfb983 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -84,4 +84,26 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 		ret;					\
 	})

+/*
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long probe_kernel_read(void *dst, void *src, size_t size);
+
+/*
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long probe_kernel_write(void *dst, void *src, size_t size);
+
 #endif		/* __LINUX_UACCESS_H__ */

diff --git a/mm/Makefile b/mm/Makefile
index a5b0dd93427a..18c143b3c46c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   vmalloc.o

 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   page_alloc.o page-writeback.o pdflush.o \
+			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o $(mmu-y)

diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 000000000000..24f81b971403
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,49 @@
+/*
+ * Access kernel memory without faulting.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+/**
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_read(void *dst, void *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst,
+			(__force const void __user *)src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+/**
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_write(void *dst, void *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_write);
--
cgit v1.2.3-59-g8ed1b

From b4b8ac524d9b6ed7229017145afa1d7afbea4a48 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Wed, 20 Feb 2008 13:33:38 -0600
Subject: kgdb: fix optional arch functions and probe_kernel_*

Fix two regressions dealing with the kgdb core.
1) kgdb_skipexception and kgdb_post_primary_code are optional functions
   that are only required on archs that need special exception fixups.

2) The kernel address space scope must be set on any probe_kernel_*
   function or archs such as ARCH=arm will not allow access to the kernel
   memory space. As an example, it is required to allow the full kernel
   address space when you use the kernel debugger to inspect a system
   call.

Signed-off-by: Jason Wessel
Signed-off-by: Ingo Molnar
---
 kernel/kgdb.c | 11 +++++++++++
 mm/maccess.c  |  6 ++++++
 2 files changed, 17 insertions(+)

(limited to 'mm')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 68aea78407e4..31425e0fbf20 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -200,6 +200,17 @@ int __weak kgdb_arch_init(void)
 	return 0;
 }

+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+	return 0;
+}
+
+void __weak
+kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
+{
+	return;
+}
+
 /**
  *	kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
  *	@regs: Current &struct pt_regs.

diff --git a/mm/maccess.c b/mm/maccess.c
index 24f81b971403..ac40796cfb15 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -17,11 +17,14 @@ long probe_kernel_read(void *dst, void *src, size_t size)
 {
 	long ret;
+	mm_segment_t old_fs = get_fs();

+	set_fs(KERNEL_DS);
 	pagefault_disable();
 	ret = __copy_from_user_inatomic(dst,
 			(__force const void __user *)src, size);
 	pagefault_enable();
+	set_fs(old_fs);

 	return ret ? -EFAULT : 0;
 }
@@ -39,10 +42,13 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
 long probe_kernel_write(void *dst, void *src, size_t size)
 {
 	long ret;
+	mm_segment_t old_fs = get_fs();

+	set_fs(KERNEL_DS);
 	pagefault_disable();
 	ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
 	pagefault_enable();
+	set_fs(old_fs);

 	return ret ? -EFAULT : 0;
 }
--
cgit v1.2.3-59-g8ed1b
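For illustration, here is a minimal sketch of how a caller such as a
debugger might use the probe_kernel_* helpers added above. This is a
hypothetical example, not part of any patch in this series: the helper
name peek_kernel_word() is invented for illustration, while
probe_kernel_read() and its return convention (0 on success, -EFAULT on
fault) come from the commits above.

	#include <linux/kernel.h>
	#include <linux/uaccess.h>

	/*
	 * Hypothetical helper: read one word from an arbitrary, possibly
	 * unmapped kernel address without risking an unhandled fault.
	 */
	static int peek_kernel_word(unsigned long addr, unsigned long *val)
	{
		/*
		 * probe_kernel_read() disables pagefaults internally and
		 * returns -EFAULT instead of oopsing on a bad address.
		 */
		if (probe_kernel_read(val, (void *)addr, sizeof(*val)))
			return -EFAULT;
		return 0;
	}

A debugger front-end could loop this helper over a memory range to dump
kernel memory safely, which is exactly the kind of use the kgdb fix above
depends on.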