From e92dd4fd1aa1cd081dac03973b33c972637d5b7a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 26 Mar 2010 19:27:58 -0700 Subject: slab: Fix continuation lines Signed-off-by: Joe Perches Signed-off-by: Pekka Enberg --- mm/slab.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a9f325b28bed..ceb4e3aa22f7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4227,10 +4227,11 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, - reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " + "%4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees, overflows); } /* cpu stats */ { -- cgit v1.2.3-59-g8ed1b From 8f9f8d9e8080a2ff46caa7decef47810d093d252 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sat, 27 Mar 2010 19:40:47 -0700 Subject: slab: add memory hotplug support Slab lacks any memory hotplug support for nodes that are hotplugged without cpus being hotplugged. This is possible at least on x86 CONFIG_MEMORY_HOTPLUG_SPARSE kernels where SRAT entries are marked ACPI_SRAT_MEM_HOT_PLUGGABLE and the regions of RAM represent a seperate node. It can also be done manually by writing the start address to /sys/devices/system/memory/probe for kernels that have CONFIG_ARCH_MEMORY_PROBE set, which is how this patch was tested, and then onlining the new memory region. When a node is hotadded, a nodelist for that node is allocated and initialized for each slab cache. If this isn't completed due to a lack of memory, the hotadd is aborted: we have a reasonable expectation that kmalloc_node(nid) will work for all caches if nid is online and memory is available. Since nodelists must be allocated and initialized prior to the new node's memory actually being online, the struct kmem_list3 is allocated off-node due to kmalloc_node()'s fallback. When an entire node would be offlined, its nodelists are subsequently drained. If slab objects still exist and cannot be freed, the offline is aborted. It is possible that objects will be allocated between this drain and page isolation, so it's still possible that the offline will still fail, however. Acked-by: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slab.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 125 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a9f325b28bed..3230cd2c6b3b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -115,6 +115,7 @@ #include #include #include +#include #include #include @@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } #endif +/* + * Allocates and initializes nodelists for a node on each slab cache, used for + * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 + * will be allocated off-node since memory is not yet online for the new node. + * When hotplugging memory or a cpu, existing nodelists are not replaced if + * already in use. + * + * Must hold cache_chain_mutex. + */ +static int init_cache_nodelists_node(int node) +{ + struct kmem_cache *cachep; + struct kmem_list3 *l3; + const int memsize = sizeof(struct kmem_list3); + + list_for_each_entry(cachep, &cache_chain, next) { + /* + * Set up the size64 kmemlist for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + if (!cachep->nodelists[node]) { + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) + return -ENOMEM; + kmem_list3_init(l3); + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. + */ + cachep->nodelists[node] = l3; + } + + spin_lock_irq(&cachep->nodelists[node]->list_lock); + cachep->nodelists[node]->free_limit = + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&cachep->nodelists[node]->list_lock); + } + return 0; +} + static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - const int memsize = sizeof(struct kmem_list3); + int err; /* * We need to do this right in the beginning since @@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu) * kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ - - list_for_each_entry(cachep, &cache_chain, next) { - /* - * Set up the size64 kmemlist for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - if (!cachep->nodelists[node]) { - l3 = kmalloc_node(memsize, GFP_KERNEL, node); - if (!l3) - goto bad; - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - /* - * The l3s don't come and go as CPUs come and - * go. cache_chain_mutex is sufficient - * protection here. - */ - cachep->nodelists[node] = l3; - } - - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); - } + err = init_cache_nodelists_node(node); + if (err < 0) + goto bad; /* * Now we can go ahead with allocating the shared arrays and @@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { &cpuup_callback, NULL, 0 }; +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +/* + * Drains freelist for a node on each slab cache, used for memory hot-remove. + * Returns -EBUSY if all objects cannot be drained so that the node is not + * removed. + * + * Must hold cache_chain_mutex. + */ +static int __meminit drain_cache_nodelists_node(int node) +{ + struct kmem_cache *cachep; + int ret = 0; + + list_for_each_entry(cachep, &cache_chain, next) { + struct kmem_list3 *l3; + + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + drain_freelist(cachep, l3, l3->free_objects); + + if (!list_empty(&l3->slabs_full) || + !list_empty(&l3->slabs_partial)) { + ret = -EBUSY; + break; + } + } + return ret; +} + +static int __meminit slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int ret = 0; + int nid; + + nid = mnb->status_change_nid; + if (nid < 0) + goto out; + + switch (action) { + case MEM_GOING_ONLINE: + mutex_lock(&cache_chain_mutex); + ret = init_cache_nodelists_node(nid); + mutex_unlock(&cache_chain_mutex); + break; + case MEM_GOING_OFFLINE: + mutex_lock(&cache_chain_mutex); + ret = drain_cache_nodelists_node(nid); + mutex_unlock(&cache_chain_mutex); + break; + case MEM_ONLINE: + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } +out: + return ret ? notifier_from_errno(ret) : NOTIFY_OK; +} +#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ + /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, - int nodeid) +static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) { struct kmem_list3 *ptr; @@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void) */ register_cpu_notifier(&cpucache_notifier); +#ifdef CONFIG_NUMA + /* + * Register a memory hotplug callback that initializes and frees + * nodelists. + */ + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +#endif + /* * The reap timers are started later, with a module init call: That part * of the kernel is not yet operational. -- cgit v1.2.3-59-g8ed1b From 5c5e3b33b7cb959a401f823707bee006caadd76e Mon Sep 17 00:00:00 2001 From: Shiyong Li Date: Mon, 12 Apr 2010 13:48:21 +0800 Subject: slab: Fix missing DEBUG_SLAB last user Even with SLAB_RED_ZONE and SLAB_STORE_USER enabled, kernel would NOT store redzone and last user data around allocated memory space if "arch cache line > sizeof(unsigned long long)". As a result, last user information is unexpectedly MISSED while dumping slab corruption log. This fix makes sure that redzone and last user tags get stored unless the required alignment breaks redzone's. Signed-off-by: Shiyong Li Signed-off-by: Pekka Enberg --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bac0f4fcc216..525c66466469 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2220,8 +2220,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < align) { ralign = align; } - /* disable debug if necessary */ - if (ralign > __alignof__(unsigned long long)) + /* disable debug if not aligning with REDZONE_ALIGN */ + if (ralign & (__alignof__(unsigned long long) - 1)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. @@ -2247,8 +2247,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += sizeof(unsigned long long); - size += 2 * sizeof(unsigned long long); + cachep->obj_offset += align; + size += align + sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of -- cgit v1.2.3-59-g8ed1b From 1f0ce8b3dd667dca720a47869f8110c298f0e5b8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 19 May 2010 12:01:42 +0100 Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to Acked-by: Herbert Xu Signed-off-by: David Woodhouse Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 24 ++++++++++++++++++++++++ mm/slab.c | 24 ------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index ca6b2b317991..1812dac8c496 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -16,6 +16,30 @@ #include #include +#ifndef ARCH_KMALLOC_MINALIGN +/* + * Enforce a minimum alignment for the kmalloc caches. + * Usually, the kmalloc caches are cache_line_size() aligned, except when + * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. + * Some archs want to perform DMA into kmalloc caches and need a guaranteed + * alignment larger than the alignment of a 64-bit integer. + * ARCH_KMALLOC_MINALIGN allows that. + * Note that increasing this value may disable some debug features. + */ +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +/* + * Enforce a minimum alignment for all caches. + * Intended for archs that get misalignment faults even for BYTES_PER_WORD + * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. + * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables + * some debug features. + */ +#define ARCH_SLAB_MINALIGN 0 +#endif + /* * struct kmem_cache * diff --git a/mm/slab.c b/mm/slab.c index bac0f4fcc216..7401ddc24306 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -144,30 +144,6 @@ #define BYTES_PER_WORD sizeof(void *) #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) -#ifndef ARCH_KMALLOC_MINALIGN -/* - * Enforce a minimum alignment for the kmalloc caches. - * Usually, the kmalloc caches are cache_line_size() aligned, except when - * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. - * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than the alignment of a 64-bit integer. - * ARCH_KMALLOC_MINALIGN allows that. - * Note that increasing this value may disable some debug features. - */ -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -/* - * Enforce a minimum alignment for all caches. - * Intended for archs that get misalignment faults even for BYTES_PER_WORD - * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. - * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables - * some debug features. - */ -#define ARCH_SLAB_MINALIGN 0 -#endif - #ifndef ARCH_KMALLOC_FLAGS #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif -- cgit v1.2.3-59-g8ed1b From bac49ce42a33f53beb7cf04e9a0600879d6265ca Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 19 May 2010 12:01:43 +0100 Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to Acked-by: Herbert Xu Signed-off-by: David Woodhouse Signed-off-by: Pekka Enberg --- include/linux/slob_def.h | 8 ++++++++ mm/slob.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 0ec00b39d006..62667f72c2ef 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -1,6 +1,14 @@ #ifndef __LINUX_SLOB_DEF_H #define __LINUX_SLOB_DEF_H +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) +#endif + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, diff --git a/mm/slob.c b/mm/slob.c index 837ebd64cc34..23631e2bb57a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -467,14 +467,6 @@ out: * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) -#endif - void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; -- cgit v1.2.3-59-g8ed1b From 4581ced379736fd76432c754f999d26deb83fbb7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 19 May 2010 12:02:14 +0100 Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to Acked-by: Herbert Xu Signed-off-by: David Woodhouse Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 8 ++++++++ mm/slub.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 0249d4175bac..55695c8d2f8a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -116,6 +116,14 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + /* * Maximum kmalloc object size handled by SLUB. Larger object allocations * are passed through to the page allocator. The page allocator "fastpath" diff --git a/mm/slub.c b/mm/slub.c index d2a54fe71ea2..c874c3efac29 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -157,14 +157,6 @@ #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA | SLAB_NOTRACK) -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif - #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ -- cgit v1.2.3-59-g8ed1b From bbd7d57bfe852d9788bae5fb171c7edb4021d8ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Mar 2010 22:25:47 +0100 Subject: slub: Potential stack overflow I discovered that we can overflow stack if CONFIG_SLUB_DEBUG=y and use slabs with many objects, since list_slab_objects() and process_slab() use DECLARE_BITMAP(map, page->objects). With 65535 bits, we use 8192 bytes of stack ... Switch these allocations to dynamic allocations. Signed-off-by: Eric Dumazet Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d2a54fe71ea2..78f1a202ca33 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2429,9 +2429,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - DECLARE_BITMAP(map, page->objects); + long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), + GFP_ATOMIC); - bitmap_zero(map, page->objects); + if (!map) + return; slab_err(s, page, "%s", text); slab_lock(page); for_each_free_object(p, s, page->freelist) @@ -2446,6 +2448,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); + kfree(map); #endif } @@ -3651,10 +3654,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc) + struct page *page, enum track_item alloc, + long *map) { void *addr = page_address(page); - DECLARE_BITMAP(map, page->objects); void *p; bitmap_zero(map, page->objects); @@ -3673,11 +3676,14 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); - if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) + if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) { + kfree(map); return sprintf(buf, "Out of memory\n"); - + } /* Push back cpu slabs */ flush_all(s); @@ -3691,9 +3697,9 @@ static int list_locations(struct kmem_cache *s, char *buf, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -3744,6 +3750,7 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); + kfree(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; -- cgit v1.2.3-59-g8ed1b From d3e14aa336b37df76ae875fa051dfdb0e765ddf9 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 8 Apr 2010 17:26:44 +0800 Subject: slub: __kmalloc_node_track_caller should trace kmalloc_large_node case commit 94b528d (kmemtrace: SLUB hooks for caller-tracking functions) missed tracing kmalloc_large_node in __kmalloc_node_track_caller. We should trace it same as __kmalloc_node. Acked-by: David Rientjes Cc: Matt Mackall Cc: Ingo Molnar Cc: Vegard Nossum Signed-off-by: Xiaotian Feng Signed-off-by: Pekka Enberg --- mm/slub.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 78f1a202ca33..52ae5a538180 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3341,8 +3341,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) - return kmalloc_large_node(size, gfpflags, node); + if (unlikely(size > SLUB_MAX_SIZE)) { + ret = kmalloc_large_node(size, gfpflags, node); + + trace_kmalloc_node(caller, ret, + size, PAGE_SIZE << get_order(size), + gfpflags, node); + + return ret; + } s = get_slab(size, gfpflags); -- cgit v1.2.3-59-g8ed1b From 6b65aaf3027c4e02b42aaefd900aa79136a30681 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 14 Apr 2010 23:58:36 +0900 Subject: slub: Use alloc_pages_exact_node() for page allocation The alloc_slab_page() in SLUB uses alloc_pages() if node is '-1'. This means that node validity check in alloc_pages_node is unnecessary and we can use alloc_pages_exact_node() to avoid comparison and branch as commit 6484eb3e2a81807722 ("page allocator: do not check NUMA node ID when the caller knows the node is valid") did for the page allocator. Cc: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Mel Gorman Signed-off-by: Minchan Kim Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 52ae5a538180..2cdd235cb801 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1084,7 +1084,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, if (node == -1) return alloc_pages(flags, order); else - return alloc_pages_node(node, flags, order); + return alloc_pages_exact_node(node, flags, order); } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) -- cgit v1.2.3-59-g8ed1b From 73367bd8eef4f4eb311005886aaa916013073265 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 21 May 2010 14:41:35 -0700 Subject: slub: move kmem_cache_node into it's own cacheline This patch is meant to improve the performance of SLUB by moving the local kmem_cache_node lock into it's own cacheline separate from kmem_cache. This is accomplished by simply removing the local_node when NUMA is enabled. On my system with 2 nodes I saw around a 5% performance increase w/ hackbench times dropping from 6.2 seconds to 5.9 seconds on average. I suspect the performance gain would increase as the number of nodes increases, but I do not have the data to currently back that up. Bugzilla-Reference: http://bugzilla.kernel.org/show_bug.cgi?id=15713 Cc: Reported-by: Alex Shi Tested-by: Alex Shi Acked-by: Yanmin Zhang Acked-by: Christoph Lameter Signed-off-by: Alexander Duyck Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 9 +++------ mm/slub.c | 33 +++++++++++---------------------- 2 files changed, 14 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 55695c8d2f8a..6ac37664e8fe 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -75,12 +75,6 @@ struct kmem_cache { int offset; /* Free pointer offset. */ struct kmem_cache_order_objects oo; - /* - * Avoid an extra cache line for UP, SMP and for the node local to - * struct kmem_cache. - */ - struct kmem_cache_node local_node; - /* Allocation and freeing of slabs */ struct kmem_cache_order_objects max; struct kmem_cache_order_objects min; @@ -102,6 +96,9 @@ struct kmem_cache { */ int remote_node_defrag_ratio; struct kmem_cache_node *node[MAX_NUMNODES]; +#else + /* Avoid an extra cache line for UP */ + struct kmem_cache_node local_node; #endif }; diff --git a/mm/slub.c b/mm/slub.c index e46e3129697d..c2d6e6951f33 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2133,7 +2133,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; - if (n && n != &s->local_node) + if (n) kmem_cache_free(kmalloc_caches, n); s->node[node] = NULL; } @@ -2142,33 +2142,22 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { int node; - int local_node; - - if (slab_state >= UP && (s < kmalloc_caches || - s >= kmalloc_caches + KMALLOC_CACHES)) - local_node = page_to_nid(virt_to_page(s)); - else - local_node = 0; for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n; - if (local_node == node) - n = &s->local_node; - else { - if (slab_state == DOWN) { - early_kmem_cache_node_alloc(gfpflags, node); - continue; - } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); - - if (!n) { - free_kmem_cache_nodes(s); - return 0; - } + if (slab_state == DOWN) { + early_kmem_cache_node_alloc(gfpflags, node); + continue; + } + n = kmem_cache_alloc_node(kmalloc_caches, + gfpflags, node); + if (!n) { + free_kmem_cache_nodes(s); + return 0; } + s->node[node] = n; init_kmem_cache_node(n, s); } -- cgit v1.2.3-59-g8ed1b From 47846b0650f2f62fc4217cfb36efc94b8d919727 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 25 May 2010 15:06:06 +0200 Subject: mm: export lru_cache_add_*() to modules This is needed to enable moving pages into the page cache in fuse with splice(..., SPLICE_F_MOVE). Signed-off-by: Miklos Szeredi --- mm/swap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 7cd60bf0a972..3ce7bc373a52 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -224,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } +EXPORT_SYMBOL(__lru_cache_add); /** * lru_cache_add_lru - add a page to a page list -- cgit v1.2.3-59-g8ed1b From a52116aba5b3eed0ee41f70b794cc1937acd5cb8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 25 May 2010 15:06:06 +0200 Subject: mm: export remove_from_page_cache() to modules This is needed to enable moving pages into the page cache in fuse with splice(..., SPLICE_F_MOVE). Signed-off-by: Miklos Szeredi --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 140ebda9640f..09a91a9a102e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -151,6 +151,7 @@ void remove_from_page_cache(struct page *page) spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); } +EXPORT_SYMBOL(remove_from_page_cache); static int sync_page(void *word) { -- cgit v1.2.3-59-g8ed1b From 66f998f611897319b555364cefd5d6e88a205866 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sun, 23 May 2010 11:00:54 -0400 Subject: fs: allow short direct-io reads to be completed via buffered IO This is similar to what already happens in the write case. If we have a short read while doing O_DIRECT, instead of just returning, fallthrough and try to read the rest via buffered IO. BTRFS needs this because if we encounter a compressed or inline extent during DIO, we need to fallback on buffered. If the extent is compressed we need to read the entire thing into memory and de-compress it into the users pages. I have tested this with fsx and everything works great. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- mm/filemap.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 140ebda9640f..829ac9cdbd70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1263,7 +1263,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg; + unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; @@ -1290,21 +1290,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); } - if (retval > 0) + if (retval > 0) { *ppos = pos + retval; - if (retval) { + count -= retval; + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } + count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; + loff_t offset = 0; + + /* + * If we did a short DIO read we need to skip the section of the + * iov that we've already read data into. + */ + if (count) { + if (count > iov[seg].iov_len) { + count -= iov[seg].iov_len; + continue; + } + offset = count; + count = 0; + } desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; + desc.arg.buf = iov[seg].iov_base + offset; + desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0; -- cgit v1.2.3-59-g8ed1b From e9d6c157385e4efa61cb8293e425c9d8beba70d3 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 24 May 2010 14:31:48 -0700 Subject: tmpfs: insert tmpfs cache pages to inactive list at first Shaohua Li reported parallel file copy on tmpfs can lead to OOM killer. This is regression of caused by commit 9ff473b9a7 ("vmscan: evict streaming IO first"). Wow, It is 2 years old patch! Currently, tmpfs file cache is inserted active list at first. This means that the insertion doesn't only increase numbers of pages in anon LRU, but it also reduces anon scanning ratio. Therefore, vmscan will get totally confused. It scans almost only file LRU even though the system has plenty unused tmpfs pages. Historically, lru_cache_add_active_anon() was used for two reasons. 1) Intend to priotize shmem page rather than regular file cache. 2) Intend to avoid reclaim priority inversion of used once pages. But we've lost both motivation because (1) Now we have separate anon and file LRU list. then, to insert active list doesn't help such priotize. (2) In past, one pte access bit will cause page activation. then to insert inactive list with pte access bit mean higher priority than to insert active list. Its priority inversion may lead to uninteded lru chun. but it was already solved by commit 645747462 (vmscan: detect mapped file pages used only once). (Thanks Hannes, you are great!) Thus, now we can use lru_cache_add_anon() instead. Signed-off-by: KOSAKI Motohiro Reported-by: Shaohua Li Reviewed-by: Wu Fengguang Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Cc: Henrique de Moraes Holschuh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 10 ---------- mm/filemap.c | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index ec2b7a42b45f..600b03b7732e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -224,21 +224,11 @@ static inline void lru_cache_add_anon(struct page *page) __lru_cache_add(page, LRU_INACTIVE_ANON); } -static inline void lru_cache_add_active_anon(struct page *page) -{ - __lru_cache_add(page, LRU_ACTIVE_ANON); -} - static inline void lru_cache_add_file(struct page *page) { __lru_cache_add(page, LRU_INACTIVE_FILE); } -static inline void lru_cache_add_active_file(struct page *page) -{ - __lru_cache_add(page, LRU_ACTIVE_FILE); -} - /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); diff --git a/mm/filemap.c b/mm/filemap.c index 140ebda9640f..d6f4f073836e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -441,7 +441,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, /* * Splice_read and readahead add shmem/tmpfs pages into the page cache * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * need to go on the anon lru below, and mem_cgroup_cache_charge * (called in add_to_page_cache) needs to know where they're going too. */ if (mapping_cap_swap_backed(mapping)) @@ -452,7 +452,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, if (page_is_file_cache(page)) lru_cache_add_file(page); else - lru_cache_add_active_anon(page); + lru_cache_add_anon(page); } return ret; } -- cgit v1.2.3-59-g8ed1b From 6dda9d55bf545013597724bf0cd79d01bd2bd944 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 24 May 2010 14:31:54 -0700 Subject: page allocator: reduce fragmentation in buddy allocator by adding buddies that are merging to the tail of the free lists In order to reduce fragmentation, this patch classifies freed pages in two groups according to their probability of being part of a high order merge. Pages belonging to a compound whose next-highest buddy is free are more likely to be part of a high order merge in the near future, so they will be added at the tail of the freelist. The remaining pages are put at the front of the freelist. In this way, the pages that are more likely to cause a big merge are kept free longer. Consequently there is a tendency to aggregate the long-living allocations on a subset of the compounds, reducing the fragmentation. This heuristic was tested on three machines, x86, x86-64 and ppc64 with 3GB of RAM in each machine. The tests were kernbench, netperf, sysbench and STREAM for performance and a high-order stress test for huge page allocations. KernBench X86 Elapsed mean 374.77 ( 0.00%) 375.10 (-0.09%) User mean 649.53 ( 0.00%) 650.44 (-0.14%) System mean 54.75 ( 0.00%) 54.18 ( 1.05%) CPU mean 187.75 ( 0.00%) 187.25 ( 0.27%) KernBench X86-64 Elapsed mean 94.45 ( 0.00%) 94.01 ( 0.47%) User mean 323.27 ( 0.00%) 322.66 ( 0.19%) System mean 36.71 ( 0.00%) 36.50 ( 0.57%) CPU mean 380.75 ( 0.00%) 381.75 (-0.26%) KernBench PPC64 Elapsed mean 173.45 ( 0.00%) 173.74 (-0.17%) User mean 587.99 ( 0.00%) 587.95 ( 0.01%) System mean 60.60 ( 0.00%) 60.57 ( 0.05%) CPU mean 373.50 ( 0.00%) 372.75 ( 0.20%) Nothing notable for kernbench. NetPerf UDP X86 64 42.68 ( 0.00%) 42.77 ( 0.21%) 128 85.62 ( 0.00%) 85.32 (-0.35%) 256 170.01 ( 0.00%) 168.76 (-0.74%) 1024 655.68 ( 0.00%) 652.33 (-0.51%) 2048 1262.39 ( 0.00%) 1248.61 (-1.10%) 3312 1958.41 ( 0.00%) 1944.61 (-0.71%) 4096 2345.63 ( 0.00%) 2318.83 (-1.16%) 8192 4132.90 ( 0.00%) 4089.50 (-1.06%) 16384 6770.88 ( 0.00%) 6642.05 (-1.94%)* NetPerf UDP X86-64 64 148.82 ( 0.00%) 154.92 ( 3.94%) 128 298.96 ( 0.00%) 312.95 ( 4.47%) 256 583.67 ( 0.00%) 626.39 ( 6.82%) 1024 2293.18 ( 0.00%) 2371.10 ( 3.29%) 2048 4274.16 ( 0.00%) 4396.83 ( 2.79%) 3312 6356.94 ( 0.00%) 6571.35 ( 3.26%) 4096 7422.68 ( 0.00%) 7635.42 ( 2.79%)* 8192 12114.81 ( 0.00%)* 12346.88 ( 1.88%) 16384 17022.28 ( 0.00%)* 17033.19 ( 0.06%)* 1.64% 2.73% NetPerf UDP PPC64 64 49.98 ( 0.00%) 50.25 ( 0.54%) 128 98.66 ( 0.00%) 100.95 ( 2.27%) 256 197.33 ( 0.00%) 191.03 (-3.30%) 1024 761.98 ( 0.00%) 785.07 ( 2.94%) 2048 1493.50 ( 0.00%) 1510.85 ( 1.15%) 3312 2303.95 ( 0.00%) 2271.72 (-1.42%) 4096 2774.56 ( 0.00%) 2773.06 (-0.05%) 8192 4918.31 ( 0.00%) 4793.59 (-2.60%) 16384 7497.98 ( 0.00%) 7749.52 ( 3.25%) The tests are run to have confidence limits within 1%. Results marked with a * were not confident although in this case, it's only outside by small amounts. Even with some results that were not confident, the netperf UDP results were generally positive. NetPerf TCP X86 64 652.25 ( 0.00%)* 648.12 (-0.64%)* 23.80% 22.82% 128 1229.98 ( 0.00%)* 1220.56 (-0.77%)* 21.03% 18.90% 256 2105.88 ( 0.00%) 1872.03 (-12.49%)* 1.00% 16.46% 1024 3476.46 ( 0.00%)* 3548.28 ( 2.02%)* 13.37% 11.39% 2048 4023.44 ( 0.00%)* 4231.45 ( 4.92%)* 9.76% 12.48% 3312 4348.88 ( 0.00%)* 4396.96 ( 1.09%)* 6.49% 8.75% 4096 4726.56 ( 0.00%)* 4877.71 ( 3.10%)* 9.85% 8.50% 8192 4732.28 ( 0.00%)* 5777.77 (18.10%)* 9.13% 13.04% 16384 5543.05 ( 0.00%)* 5906.24 ( 6.15%)* 7.73% 8.68% NETPERF TCP X86-64 netperf-tcp-vanilla-netperf netperf-tcp tcp-vanilla pgalloc-delay 64 1895.87 ( 0.00%)* 1775.07 (-6.81%)* 5.79% 4.78% 128 3571.03 ( 0.00%)* 3342.20 (-6.85%)* 3.68% 6.06% 256 5097.21 ( 0.00%)* 4859.43 (-4.89%)* 3.02% 2.10% 1024 8919.10 ( 0.00%)* 8892.49 (-0.30%)* 5.89% 6.55% 2048 10255.46 ( 0.00%)* 10449.39 ( 1.86%)* 7.08% 7.44% 3312 10839.90 ( 0.00%)* 10740.15 (-0.93%)* 6.87% 7.33% 4096 10814.84 ( 0.00%)* 10766.97 (-0.44%)* 6.86% 8.18% 8192 11606.89 ( 0.00%)* 11189.28 (-3.73%)* 7.49% 5.55% 16384 12554.88 ( 0.00%)* 12361.22 (-1.57%)* 7.36% 6.49% NETPERF TCP PPC64 netperf-tcp-vanilla-netperf netperf-tcp tcp-vanilla pgalloc-delay 64 594.17 ( 0.00%) 596.04 ( 0.31%)* 1.00% 2.29% 128 1064.87 ( 0.00%)* 1074.77 ( 0.92%)* 1.30% 1.40% 256 1852.46 ( 0.00%)* 1856.95 ( 0.24%) 1.25% 1.00% 1024 3839.46 ( 0.00%)* 3813.05 (-0.69%) 1.02% 1.00% 2048 4885.04 ( 0.00%)* 4881.97 (-0.06%)* 1.15% 1.04% 3312 5506.90 ( 0.00%) 5459.72 (-0.86%) 4096 6449.19 ( 0.00%) 6345.46 (-1.63%) 8192 7501.17 ( 0.00%) 7508.79 ( 0.10%) 16384 9618.65 ( 0.00%) 9490.10 (-1.35%) There was a distinct lack of confidence in the X86* figures so I included what the devation was where the results were not confident. Many of the results, whether gains or losses were within the standard deviation so no solid conclusion can be reached on performance impact. Looking at the figures, only the X86-64 ones look suspicious with a few losses that were outside the noise. However, the results were so unstable that without knowing why they vary so much, a solid conclusion cannot be reached. SYSBENCH X86 sysbench-vanilla pgalloc-delay 1 7722.85 ( 0.00%) 7756.79 ( 0.44%) 2 14901.11 ( 0.00%) 13683.44 (-8.90%) 3 15171.71 ( 0.00%) 14888.25 (-1.90%) 4 14966.98 ( 0.00%) 15029.67 ( 0.42%) 5 14370.47 ( 0.00%) 14865.00 ( 3.33%) 6 14870.33 ( 0.00%) 14845.57 (-0.17%) 7 14429.45 ( 0.00%) 14520.85 ( 0.63%) 8 14354.35 ( 0.00%) 14362.31 ( 0.06%) SYSBENCH X86-64 1 17448.70 ( 0.00%) 17484.41 ( 0.20%) 2 34276.39 ( 0.00%) 34251.00 (-0.07%) 3 50805.25 ( 0.00%) 50854.80 ( 0.10%) 4 66667.10 ( 0.00%) 66174.69 (-0.74%) 5 66003.91 ( 0.00%) 65685.25 (-0.49%) 6 64981.90 ( 0.00%) 65125.60 ( 0.22%) 7 64933.16 ( 0.00%) 64379.23 (-0.86%) 8 63353.30 ( 0.00%) 63281.22 (-0.11%) 9 63511.84 ( 0.00%) 63570.37 ( 0.09%) 10 62708.27 ( 0.00%) 63166.25 ( 0.73%) 11 62092.81 ( 0.00%) 61787.75 (-0.49%) 12 61330.11 ( 0.00%) 61036.34 (-0.48%) 13 61438.37 ( 0.00%) 61994.47 ( 0.90%) 14 62304.48 ( 0.00%) 62064.90 (-0.39%) 15 63296.48 ( 0.00%) 62875.16 (-0.67%) 16 63951.76 ( 0.00%) 63769.09 (-0.29%) SYSBENCH PPC64 -sysbench-pgalloc-delay-sysbench sysbench-vanilla pgalloc-delay 1 7645.08 ( 0.00%) 7467.43 (-2.38%) 2 14856.67 ( 0.00%) 14558.73 (-2.05%) 3 21952.31 ( 0.00%) 21683.64 (-1.24%) 4 27946.09 ( 0.00%) 28623.29 ( 2.37%) 5 28045.11 ( 0.00%) 28143.69 ( 0.35%) 6 27477.10 ( 0.00%) 27337.45 (-0.51%) 7 26489.17 ( 0.00%) 26590.06 ( 0.38%) 8 26642.91 ( 0.00%) 25274.33 (-5.41%) 9 25137.27 ( 0.00%) 24810.06 (-1.32%) 10 24451.99 ( 0.00%) 24275.85 (-0.73%) 11 23262.20 ( 0.00%) 23674.88 ( 1.74%) 12 24234.81 ( 0.00%) 23640.89 (-2.51%) 13 24577.75 ( 0.00%) 24433.50 (-0.59%) 14 25640.19 ( 0.00%) 25116.52 (-2.08%) 15 26188.84 ( 0.00%) 26181.36 (-0.03%) 16 26782.37 ( 0.00%) 26255.99 (-2.00%) Again, there is little to conclude here. While there are a few losses, the results vary by +/- 8% in some cases. They are the results of most concern as there are some large losses but it's also within the variance typically seen between kernel releases. The STREAM results varied so little and are so verbose that I didn't include them here. The final test stressed how many huge pages can be allocated. The absolute number of huge pages allocated are the same with or without the page. However, the "unusability free space index" which is a measure of external fragmentation was slightly lower (lower is better) throughout the lifetime of the system. I also measured the latency of how long it took to successfully allocate a huge page. The latency was slightly lower and on X86 and PPC64, more huge pages were allocated almost immediately from the free lists. The improvement is slight but there. [mel@csn.ul.ie: Tested, reworked for less branches] [czoccolo@gmail.com: fix oops by checking pfn_valid_within()] Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Christoph Lameter Acked-by: Rik van Riel Reviewed-by: Pekka Enberg Cc: Corrado Zoccolo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6326c71b663..596180fedd3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -475,6 +475,8 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; + unsigned long combined_idx; + struct page *buddy; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -488,9 +490,6 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - unsigned long combined_idx; - struct page *buddy; - buddy = __page_find_buddy(page, page_idx, order); if (!page_is_buddy(page, buddy, order)) break; @@ -505,8 +504,29 @@ static inline void __free_one_page(struct page *page, order++; } set_page_order(page, order); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + + /* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ + if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = __find_combined_index(page_idx, order); + higher_page = page + combined_idx - page_idx; + higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); + goto out; + } + } + + list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); +out: zone->free_area[order].nr_free++; } -- cgit v1.2.3-59-g8ed1b From e48e67e08c340def3d0349c2910d23c7985fb6fa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 24 May 2010 14:31:57 -0700 Subject: sparsemem: on no vmemmap path put mem_map on node high too We need to put mem_map high when virtual memmap is not used. before this patch free mem pfn range on first node: [ 0.000000] 19 - 1f [ 0.000000] 28 40 - 80 95 [ 0.000000] 702 740 - 1000 1000 [ 0.000000] 347c - 347e [ 0.000000] 34e7 3500 - 3b80 3b8b [ 0.000000] 73b8b 73bc0 - 73c00 73c00 [ 0.000000] 73ddd - 73e00 [ 0.000000] 73fdd - 74000 [ 0.000000] 741dd - 74200 [ 0.000000] 743dd - 74400 [ 0.000000] 745dd - 74600 [ 0.000000] 747dd - 74800 [ 0.000000] 749dd - 74a00 [ 0.000000] 74bdd - 74c00 [ 0.000000] 74ddd - 74e00 [ 0.000000] 74fdd - 75000 [ 0.000000] 751dd - 75200 [ 0.000000] 753dd - 75400 [ 0.000000] 755dd - 75600 [ 0.000000] 757dd - 75800 [ 0.000000] 759dd - 75a00 [ 0.000000] 79bdd 79c00 - 7d540 7d550 [ 0.000000] 7f745 - 7f750 [ 0.000000] 10000b 100040 - 2080000 2080000 so only 79c00 - 7d540 are major free block under 4g... after this patch, we will get [ 0.000000] 19 - 1f [ 0.000000] 28 40 - 80 95 [ 0.000000] 702 740 - 1000 1000 [ 0.000000] 347c - 347e [ 0.000000] 34e7 3500 - 3600 3600 [ 0.000000] 37dd - 3800 [ 0.000000] 39dd - 3a00 [ 0.000000] 3bdd - 3c00 [ 0.000000] 3ddd - 3e00 [ 0.000000] 3fdd - 4000 [ 0.000000] 41dd - 4200 [ 0.000000] 43dd - 4400 [ 0.000000] 45dd - 4600 [ 0.000000] 47dd - 4800 [ 0.000000] 49dd - 4a00 [ 0.000000] 4bdd - 4c00 [ 0.000000] 4ddd - 4e00 [ 0.000000] 4fdd - 5000 [ 0.000000] 51dd - 5200 [ 0.000000] 53dd - 5400 [ 0.000000] 95dd 9600 - 7d540 7d550 [ 0.000000] 7f745 - 7f750 [ 0.000000] 17000b 170040 - 2080000 2080000 we will have 9600 - 7d540 for major free block... sparse-vmemmap path already used __alloc_bootmem_node_high() Signed-off-by: Yinghai Lu Cc: Jiri Slaby Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Christoph Lameter Cc: Greg Thelen Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index dc0cc4d43ff3..95ac219af379 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) { struct page *map; + unsigned long size; map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; - map = alloc_bootmem_pages_node(NODE_DATA(nid), - PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); + size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); + map = __alloc_bootmem_node_high(NODE_DATA(nid), size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); return map; } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); + map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) -- cgit v1.2.3-59-g8ed1b From 4b50dc26a0a25a9d1998d206e1f7d849aa78063f Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 24 May 2010 14:31:58 -0700 Subject: shmem: remove redundant code prep_new_page() will call set_page_private(page, 0) to initialise the page, so the code is redundant. Signed-off-by: Huang Shijie Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 0cd7f66f1c66..4ef9797bd430 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); - if (page) - set_page_private(page, 0); spin_lock(&info->lock); if (!page) { -- cgit v1.2.3-59-g8ed1b From e13861d822f8f443ca0c020ea8fc2dc01039cd63 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 24 May 2010 14:31:59 -0700 Subject: mm: remove return value of putback_lru_pages() putback_lru_page() never can fail. So it doesn't matter count of "the number of pages put back". In addition, users of this functions don't use return value. Let's remove unnecessary code. Signed-off-by: Minchan Kim Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ++-- mm/migrate.c | 7 +------ 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7f085c97c799..7a07b17d27c5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -9,7 +9,7 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION #define PAGE_MIGRATION 1 -extern int putback_lru_pages(struct list_head *l); +extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, @@ -25,7 +25,7 @@ extern int migrate_vmas(struct mm_struct *mm, #else #define PAGE_MIGRATION 0 -static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, int offlining) { return -ENOSYS; } diff --git a/mm/migrate.c b/mm/migrate.c index d3f3f7f81075..5938db54e1d7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -58,23 +58,18 @@ int migrate_prep(void) /* * Add isolated pages on the list back to the LRU under page lock * to avoid leaking evictable pages back onto unevictable list. - * - * returns the number of pages put back. */ -int putback_lru_pages(struct list_head *l) +void putback_lru_pages(struct list_head *l) { struct page *page; struct page *page2; - int count = 0; list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); - count++; } - return count; } /* -- cgit v1.2.3-59-g8ed1b From 6d556294d5b27fb12f18be7495af45b6156a409e Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 24 May 2010 14:31:59 -0700 Subject: mempolicy: remove redundant code 1. In funtion is_valid_nodemask(), varibable k will be inited to 0 in the following loop, needn't init to policy_zone anymore. 2. (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) has already defined to MPOL_MODE_FLAGS in mempolicy.h. Signed-off-by: Bob Liu Acked-by: David Rientjes Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 08f40a2f3fe0..ad500f3b12bf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -127,9 +127,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask) { int nd, k; - /* Check that there is something useful in this mask */ - k = policy_zone; - for_each_node_mask(nd, *nodemask) { struct zone *z; @@ -145,7 +142,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask) static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); + return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, -- cgit v1.2.3-59-g8ed1b From 6eb27e1fdf5781719a3d2e90e6c89fa012135c62 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 24 May 2010 14:32:00 -0700 Subject: mempolicy: remove case MPOL_INTERLEAVE from policy_zonelist() In policy_zonelist() mode MPOL_INTERLEAVE shouldn't happen, so fall through to BUG() instead of break to return. I also fixed the comment. Signed-off-by: Bob Liu Acked-by: David Rientjes Cc: Andi Kleen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ad500f3b12bf..d97355b744ab 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1441,15 +1441,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) /* * Normally, MPOL_BIND allocations are node-local within the * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node is part of the mask, we use the zonelist for + * current node isn't part of the mask, we use the zonelist for * the first node in the mask instead. */ if (unlikely(gfp & __GFP_THISNODE) && unlikely(!node_isset(nd, policy->v.nodes))) nd = first_node(policy->v.nodes); break; - case MPOL_INTERLEAVE: /* should not happen */ - break; default: BUG(); } -- cgit v1.2.3-59-g8ed1b From 1980050250fa052b1c24a19f9b3d82fae14d77f8 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 24 May 2010 14:32:01 -0700 Subject: mempolicy: remove redundant check Lee's patch "mempolicy: use MPOL_PREFERRED for system-wide default policy" has made the MPOL_DEFAULT only used in the memory policy APIs. So, no need to check in __mpol_equal also. Also get rid of mpol_match_intent() and move its logic directly into __mpol_equal(). Signed-off-by: Bob Liu Acked-by: David Rientjes Cc: Andi Kleen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d97355b744ab..ac5aeafaec9a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1787,16 +1787,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, return tompol; } -static int mpol_match_intent(const struct mempolicy *a, - const struct mempolicy *b) -{ - if (a->flags != b->flags) - return 0; - if (!mpol_store_user_nodemask(a)) - return 1; - return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); -} - /* Slow path of a mempolicy comparison */ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -1804,8 +1794,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return 0; if (a->mode != b->mode) return 0; - if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) + if (a->flags != b->flags) return 0; + if (mpol_store_user_nodemask(a)) + if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) + return 0; + switch (a->mode) { case MPOL_BIND: /* Fall through */ -- cgit v1.2.3-59-g8ed1b From e17f74af351cce9a1bade7b33af179497fdf95cf Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 24 May 2010 14:32:02 -0700 Subject: mempolicy: don't call mpol_set_nodemask() when no_context No need to call mpol_set_nodemask() when we have no context for the mempolicy. This can occur when we're parsing a tmpfs 'mpol' mount option. Just save the raw nodemask in the mempolicy's w.user_nodemask member for use when a tmpfs/shmem file is created. mpol_shared_policy_init() will "contextualize" the policy for the new file based on the creating task's context. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ac5aeafaec9a..0e1b293e4054 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2239,7 +2239,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - { + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } else { int ret; NODEMASK_SCRATCH(scratch); if (scratch) { @@ -2255,10 +2258,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) } } err = 0; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } out: /* Restore string for error message */ -- cgit v1.2.3-59-g8ed1b From b4652e8429100ba5c3ddb49499faa1188c98c246 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 24 May 2010 14:32:03 -0700 Subject: mempolicy: lose unnecessary loop variable in mpol_parse_str() We don't really need the extra variable 'i' in mpol_parse_str(). The only use is as the the loop variable. Then, it's assigned to 'mode'. Just use mode, and loose the 'uninitialized_var()' macro. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e1b293e4054..b4f1265df2d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2148,12 +2148,11 @@ static const char * const policy_types[] = int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) { struct mempolicy *new = NULL; - unsigned short uninitialized_var(mode); + unsigned short mode; unsigned short uninitialized_var(mode_flags); nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int i; int err = 1; if (nodelist) { @@ -2169,13 +2168,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (i = 0; i <= MPOL_LOCAL; i++) { - if (!strcmp(str, policy_types[i])) { - mode = i; + for (mode = 0; mode <= MPOL_LOCAL; mode++) { + if (!strcmp(str, policy_types[mode])) { break; } } - if (i > MPOL_LOCAL) + if (mode > MPOL_LOCAL) goto out; switch (mode) { -- cgit v1.2.3-59-g8ed1b From 345ace9c797030e77da8ff211b9502370b9d81ab Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 24 May 2010 14:32:04 -0700 Subject: mempolicy: rename policy_types and cleanup initialization Rename 'policy_types[]' to 'policy_modes[]' to better match the array contents. Use designated intializer syntax for policy_modes[]. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b4f1265df2d8..ade57322fa2a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2121,9 +2121,15 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) -static const char * const policy_types[] = - { "default", "prefer", "bind", "interleave", "local" }; +#define MPOL_LOCAL MPOL_MAX +static const char * const policy_modes[] = +{ + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "prefer", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local" +}; #ifdef CONFIG_TMPFS @@ -2169,7 +2175,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) *flags++ = '\0'; /* terminate mode string */ for (mode = 0; mode <= MPOL_LOCAL; mode++) { - if (!strcmp(str, policy_types[mode])) { + if (!strcmp(str, policy_modes[mode])) { break; } } @@ -2324,11 +2330,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) BUG(); } - l = strlen(policy_types[mode]); + l = strlen(policy_modes[mode]); if (buffer + maxlen < p + l + 1) return -ENOSPC; - strcpy(p, policy_types[mode]); + strcpy(p, policy_modes[mode]); p += l; if (flags & MPOL_MODE_FLAGS) { -- cgit v1.2.3-59-g8ed1b From 15d77835ac48dbc2d4884376ea6a08b65b1c40ba Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 24 May 2010 14:32:04 -0700 Subject: mempolicy: factor mpol_shared_policy_init() return paths Factor out duplicate put/frees in mpol_shared_policy_init() to a common return path. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ade57322fa2a..0c73c8b814cd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1995,26 +1995,22 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) { - mpol_put(mpol); /* drop our ref on sb mpol */ - NODEMASK_SCRATCH_FREE(scratch); - return; /* no valid nodemask intersection */ - } + if (IS_ERR(new)) + goto put_free; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ - if (ret) { - NODEMASK_SCRATCH_FREE(scratch); - mpol_put(new); - return; - } + if (ret) + goto put_free; /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + +put_free: mpol_put(new); /* drop initial ref */ NODEMASK_SCRATCH_FREE(scratch); } -- cgit v1.2.3-59-g8ed1b From 708c1bbc9d0c3e57f40501794d9b0eed29d10fce Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:07 -0700 Subject: mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog & = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 15 ++++-- kernel/cpuset.c | 4 +- mm/mempolicy.c | 124 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 119 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1cc966cd3e5f..7b9ef6bf45aa 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -23,6 +23,13 @@ enum { MPOL_MAX, /* always last member of enum */ }; +enum mpol_rebind_step { + MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ + MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ + MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ + MPOL_REBIND_NSTEP, +}; + /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -51,6 +58,7 @@ enum { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ +#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #ifdef __KERNEL__ @@ -193,8 +201,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern void mpol_fix_fork_child_flag(struct task_struct *p); @@ -308,7 +316,8 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new) + const nodemask_t *new, + enum mpol_rebind_step step) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a50c5f6e727..db0990ac3fac 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -953,8 +953,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed); - mpol_rebind_task(tsk, newmems); + mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); tsk->mems_allowed = *newmems; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0c73c8b814cd..8a993db88029 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -119,7 +119,22 @@ struct mempolicy default_policy = { static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; /* Check that the nodemask contains at least one populated zone */ @@ -274,12 +289,19 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { } -static void mpol_rebind_nodemask(struct mempolicy *pol, - const nodemask_t *nodes) +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -288,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); } - pol->v.nodes = tmp; + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + if (!node_isset(current->il_next, tmp)) { current->il_next = next_node(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) @@ -304,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes) + const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -327,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, } } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && step == 0 && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->mode].rebind(pol, newmask); + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); } /* @@ -346,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) { - mpol_rebind_policy(tsk->mempolicy, new); + mpol_rebind_policy(tsk->mempolicy, new, step); } /* @@ -363,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); up_write(&mm->mmap_sem); } @@ -1745,6 +1817,9 @@ EXPORT_SYMBOL(alloc_pages_current); * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ @@ -1754,13 +1829,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - mpol_rebind_policy(old, &mems); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } rcu_read_unlock(); - *new = *old; atomic_set(&new->refcnt, 1); return new; } -- cgit v1.2.3-59-g8ed1b From c0ff7453bb5c7c98e0885fb94279f2571946f280 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:08 -0700 Subject: cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all old unallowed bits later. But in the way, the allocator may find that there is no node to alloc memory. The reason is that cpuset rebinds the task's mempolicy, it cleans the nodes which the allocater can alloc pages on, for example: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom This patch fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. [akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 43 +++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + kernel/cpuset.c | 58 +++++++++++++++++++++++++++++++++++++++++++------- kernel/exit.c | 2 ++ mm/filemap.c | 10 +++++++-- mm/hugetlb.c | 12 +++++++---- mm/mempolicy.c | 24 +++++++++++++++++---- mm/page_alloc.c | 6 +++++- mm/slab.c | 4 ++++ mm/slub.c | 6 +++++- mm/vmscan.c | 2 ++ 11 files changed, 148 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a73454aec333..20b51cab6593 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +/* + * reading current mems_allowed and mempolicy in the fastpath must protected + * by get_mems_allowed() + */ +static inline void get_mems_allowed(void) +{ + current->mems_allowed_change_disable++; + + /* + * ensure that reading mems_allowed and mempolicy happens after the + * update of ->mems_allowed_change_disable. + * + * the write-side task finds ->mems_allowed_change_disable is not 0, + * and knows the read-side task is reading mems_allowed or mempolicy, + * so it will clear old bits lazily. + */ + smp_mb(); +} + +static inline void put_mems_allowed(void) +{ + /* + * ensure that reading mems_allowed and mempolicy before reducing + * mems_allowed_change_disable. + * + * the write-side task will know that the read-side task is still + * reading mems_allowed or mempolicy, don't clears old bits in the + * nodemask. + */ + smp_mb(); + --ACCESS_ONCE(current->mems_allowed_change_disable); +} + static inline void set_mems_allowed(nodemask_t nodemask) { + task_lock(current); current->mems_allowed = nodemask; + task_unlock(current); } #else /* !CONFIG_CPUSETS */ @@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } +static inline void get_mems_allowed(void) +{ +} + +static inline void put_mems_allowed(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b55e988988b5..415b8f8a3f45 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1421,6 +1421,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ + int mems_allowed_change_disable; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS diff --git a/kernel/cpuset.c b/kernel/cpuset.c index db0990ac3fac..61d6af7fa676 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * In order to avoid seeing no nodes if the old and new nodes are disjoint, * we structure updates as setting all new allowed nodes, then clearing newly * disallowed ones. - * - * Called with task's alloc_lock held */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure checking ->mems_allowed_change_disable after setting all new + * allowed nodes. + * + * the read-side task can see an nodemask with new allowed nodes and + * old allowed nodes. and if it allocates page when cpuset clears newly + * disallowed ones continuous, it can see the new allowed bits. + * + * And if setting all new allowed nodes is after the checking, setting + * all new allowed nodes and clearing newly disallowed ones will be done + * continuous, and the read-side task may find no node to alloc page. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. + */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure checking ->mems_allowed_change_disable before clearing all new + * disallowed nodes. + * + * if clearing newly disallowed bits before the checking, the read-side + * task may find no node to alloc page. + */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + task_unlock(tsk); } /* @@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p, cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, newmems); - task_lock(p); cpuset_change_task_nodemask(p, newmems); - task_unlock(p); NODEMASK_FREE(newmems); @@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - task_lock(tsk); cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); } diff --git a/kernel/exit.c b/kernel/exit.c index eabca5a73a85..019a2843bf95 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) diff --git a/mm/filemap.c b/mm/filemap.c index d6f4f073836e..88d719665a28 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c9e6bbf3772..54d42b009dbe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8a993db88029..721b2b338032 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1639,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. + * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1684,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) if (!(mask && current->mempolicy)) return false; + task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: @@ -1703,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) default: BUG(); } + task_unlock(current); return true; } @@ -1750,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1766,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1796,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 596180fedd3a..f7da2a2934b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1990,10 +1990,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2003,6 +2006,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; diff --git a/mm/slab.c b/mm/slab.c index 50a73fca19c4..02786e1a32d2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_node_id(); + get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_mem_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3302,6 +3305,7 @@ retry: } } } + put_mems_allowed(); return obj; } diff --git a/mm/slub.c b/mm/slub.c index e46e3129697d..26f0cb9cc584 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ff3311447f5..f2c367c9ec12 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1774,6 +1774,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) @@ -1857,6 +1858,7 @@ out: mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); delayacct_freepages_end(); + put_mems_allowed(); return ret; } -- cgit v1.2.3-59-g8ed1b From 6a60f1b3588aef6ddceaa14192df475d430cce45 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:09 -0700 Subject: mincore: cleanups This fixes some minor issues that bugged me while going over the code: o adjust argument order of do_mincore() to match the syscall o simplify range length calculation o drop superfluous shift in huge tlb calculation, address is page aligned o drop dead nr_huge calculation o check pte_none() before pte_present() o comment and whitespace fixes No semantic changes intended. Signed-off-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 76 +++++++++++++++++++++--------------------------------------- 1 file changed, 27 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index f77433c20279..1f6574c5167b 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -54,7 +54,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * all the arguments, we hold the mmap semaphore: we should * just return the amount of info we're asked for. */ -static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) +static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) { pgd_t *pgd; pud_t *pud; @@ -64,35 +64,29 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag unsigned long nr; int i; pgoff_t pgoff; - struct vm_area_struct *vma = find_vma(current->mm, addr); + struct vm_area_struct *vma; - /* - * find_vma() didn't find anything above us, or we're - * in an unmapped hole in the address space: ENOMEM. - */ + vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; + nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); + #ifdef CONFIG_HUGETLB_PAGE if (is_vm_hugetlb_page(vma)) { struct hstate *h; - unsigned long nr_huge; - unsigned char present; i = 0; - nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); h = hstate_vma(vma); - nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) - - (addr >> huge_page_shift(h)) + 1; - nr_huge = min(nr_huge, - (vma->vm_end - addr) >> huge_page_shift(h)); while (1) { - /* hugepage always in RAM for now, - * but generally it needs to be check */ + unsigned char present; + /* + * Huge pages are always in RAM for now, but + * theoretically it needs to be checked. + */ ptep = huge_pte_offset(current->mm, addr & huge_page_mask(h)); - present = !!(ptep && - !huge_pte_none(huge_ptep_get(ptep))); + present = ptep && !huge_pte_none(huge_ptep_get(ptep)); while (1) { vec[i++] = present; addr += PAGE_SIZE; @@ -100,8 +94,7 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag if (i == nr) return nr; /* check hugepage border */ - if (!((addr & ~huge_page_mask(h)) - >> PAGE_SHIFT)) + if (!(addr & ~huge_page_mask(h))) break; } } @@ -113,17 +106,7 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag * Calculate how many pages there are left in the last level of the * PTE array for our address. */ - nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)); - - /* - * Don't overrun this vma - */ - nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT); - - /* - * Don't return more than the caller asked for - */ - nr = min(nr, pages); + nr = min(nr, PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1))); pgd = pgd_offset(vma->vm_mm, addr); if (pgd_none_or_clear_bad(pgd)) @@ -137,43 +120,38 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { - unsigned char present; pte_t pte = *ptep; - if (pte_present(pte)) { - present = 1; - - } else if (pte_none(pte)) { + if (pte_none(pte)) { if (vma->vm_file) { pgoff = linear_page_index(vma, addr); - present = mincore_page(vma->vm_file->f_mapping, - pgoff); + vec[i] = mincore_page(vma->vm_file->f_mapping, + pgoff); } else - present = 0; - - } else if (pte_file(pte)) { + vec[i] = 0; + } else if (pte_present(pte)) + vec[i] = 1; + else if (pte_file(pte)) { pgoff = pte_to_pgoff(pte); - present = mincore_page(vma->vm_file->f_mapping, pgoff); - + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); + if (is_migration_entry(entry)) { /* migration entries are always uptodate */ - present = 1; + vec[i] = 1; } else { #ifdef CONFIG_SWAP pgoff = entry.val; - present = mincore_page(&swapper_space, pgoff); + vec[i] = mincore_page(&swapper_space, pgoff); #else WARN_ON(1); - present = 1; + vec[i] = 1; #endif } } - - vec[i] = present; } - pte_unmap_unlock(ptep-1, ptl); + pte_unmap_unlock(ptep - 1, ptl); return nr; @@ -247,7 +225,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, * the temporary buffer size. */ down_read(¤t->mm->mmap_sem); - retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); up_read(¤t->mm->mmap_sem); if (retval <= 0) -- cgit v1.2.3-59-g8ed1b From f488401076c5570130c018e573f450a9a6c43365 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:10 -0700 Subject: mincore: break do_mincore() into logical pieces Split out functions to handle hugetlb ranges, pte ranges and unmapped ranges, to improve readability but also to prepare the file structure for nested page table walks. No semantic changes intended. Signed-off-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 171 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 97 insertions(+), 74 deletions(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index 1f6574c5167b..a0c4c10bbab7 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -19,6 +19,42 @@ #include #include +static void mincore_hugetlb_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long nr, + unsigned char *vec) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct hstate *h; + int i; + + i = 0; + h = hstate_vma(vma); + while (1) { + unsigned char present; + pte_t *ptep; + /* + * Huge pages are always in RAM for now, but + * theoretically it needs to be checked. + */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = ptep && !huge_pte_none(huge_ptep_get(ptep)); + while (1) { + vec[i++] = present; + addr += PAGE_SIZE; + /* reach buffer limit */ + if (i == nr) + return; + /* check hugepage border */ + if (!(addr & ~huge_page_mask(h))) + break; + } + } +#else + BUG(); +#endif +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -49,6 +85,64 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) return present; } +static void mincore_unmapped_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long nr, + unsigned char *vec) +{ + int i; + + if (vma->vm_file) { + pgoff_t pgoff; + + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; + } +} + +static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long nr, + unsigned char *vec) +{ + spinlock_t *ptl; + pte_t *ptep; + int i; + + ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { + pte_t pte = *ptep; + pgoff_t pgoff; + + if (pte_none(pte)) + mincore_unmapped_range(vma, addr, 1, vec); + else if (pte_present(pte)) + vec[i] = 1; + else if (pte_file(pte)) { + pgoff = pte_to_pgoff(pte); + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { /* pte is a swap entry */ + swp_entry_t entry = pte_to_swp_entry(pte); + + if (is_migration_entry(entry)) { + /* migration entries are always uptodate */ + vec[i] = 1; + } else { +#ifdef CONFIG_SWAP + pgoff = entry.val; + vec[i] = mincore_page(&swapper_space, pgoff); +#else + WARN_ON(1); + vec[i] = 1; +#endif + } + } + } + pte_unmap_unlock(ptep - 1, ptl); +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -59,11 +153,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; unsigned long nr; - int i; - pgoff_t pgoff; struct vm_area_struct *vma; vma = find_vma(current->mm, addr); @@ -72,35 +162,10 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); -#ifdef CONFIG_HUGETLB_PAGE if (is_vm_hugetlb_page(vma)) { - struct hstate *h; - - i = 0; - h = hstate_vma(vma); - while (1) { - unsigned char present; - /* - * Huge pages are always in RAM for now, but - * theoretically it needs to be checked. - */ - ptep = huge_pte_offset(current->mm, - addr & huge_page_mask(h)); - present = ptep && !huge_pte_none(huge_ptep_get(ptep)); - while (1) { - vec[i++] = present; - addr += PAGE_SIZE; - /* reach buffer limit */ - if (i == nr) - return nr; - /* check hugepage border */ - if (!(addr & ~huge_page_mask(h))) - break; - } - } + mincore_hugetlb_page_range(vma, addr, nr, vec); return nr; } -#endif /* * Calculate how many pages there are left in the last level of the @@ -118,53 +183,11 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v if (pmd_none_or_clear_bad(pmd)) goto none_mapped; - ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { - pte_t pte = *ptep; - - if (pte_none(pte)) { - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - vec[i] = mincore_page(vma->vm_file->f_mapping, - pgoff); - } else - vec[i] = 0; - } else if (pte_present(pte)) - vec[i] = 1; - else if (pte_file(pte)) { - pgoff = pte_to_pgoff(pte); - vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { /* pte is a swap entry */ - swp_entry_t entry = pte_to_swp_entry(pte); - - if (is_migration_entry(entry)) { - /* migration entries are always uptodate */ - vec[i] = 1; - } else { -#ifdef CONFIG_SWAP - pgoff = entry.val; - vec[i] = mincore_page(&swapper_space, pgoff); -#else - WARN_ON(1); - vec[i] = 1; -#endif - } - } - } - pte_unmap_unlock(ptep - 1, ptl); - + mincore_pte_range(vma, pmd, addr, nr, vec); return nr; none_mapped: - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - for (i = 0; i < nr; i++, pgoff++) - vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { - for (i = 0; i < nr; i++) - vec[i] = 0; - } - + mincore_unmapped_range(vma, addr, nr, vec); return nr; } -- cgit v1.2.3-59-g8ed1b From 25ef0e50cca790370ad7838e3ad74db6a6a2d829 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:11 -0700 Subject: mincore: pass ranges as start,end address pairs Instead of passing a start address and a number of pages into the helper functions, convert them to use a start and an end address. Signed-off-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 57 +++++++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index a0c4c10bbab7..211604adc23c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -20,14 +20,12 @@ #include static void mincore_hugetlb_page_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long nr, + unsigned long addr, unsigned long end, unsigned char *vec) { #ifdef CONFIG_HUGETLB_PAGE struct hstate *h; - int i; - i = 0; h = hstate_vma(vma); while (1) { unsigned char present; @@ -40,10 +38,10 @@ static void mincore_hugetlb_page_range(struct vm_area_struct *vma, addr & huge_page_mask(h)); present = ptep && !huge_pte_none(huge_ptep_get(ptep)); while (1) { - vec[i++] = present; + *vec = present; + vec++; addr += PAGE_SIZE; - /* reach buffer limit */ - if (i == nr) + if (addr == end) return; /* check hugepage border */ if (!(addr & ~huge_page_mask(h))) @@ -86,9 +84,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) } static void mincore_unmapped_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long nr, + unsigned long addr, unsigned long end, unsigned char *vec) { + unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; if (vma->vm_file) { @@ -104,42 +103,44 @@ static void mincore_unmapped_range(struct vm_area_struct *vma, } static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long nr, + unsigned long addr, unsigned long end, unsigned char *vec) { + unsigned long next; spinlock_t *ptl; pte_t *ptep; - int i; ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { + do { pte_t pte = *ptep; pgoff_t pgoff; + next = addr + PAGE_SIZE; if (pte_none(pte)) - mincore_unmapped_range(vma, addr, 1, vec); + mincore_unmapped_range(vma, addr, next, vec); else if (pte_present(pte)) - vec[i] = 1; + *vec = 1; else if (pte_file(pte)) { pgoff = pte_to_pgoff(pte); - vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + *vec = mincore_page(vma->vm_file->f_mapping, pgoff); } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (is_migration_entry(entry)) { /* migration entries are always uptodate */ - vec[i] = 1; + *vec = 1; } else { #ifdef CONFIG_SWAP pgoff = entry.val; - vec[i] = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(&swapper_space, pgoff); #else WARN_ON(1); - vec[i] = 1; + *vec = 1; #endif } } - } + vec++; + } while (ptep++, addr = next, addr != end); pte_unmap_unlock(ptep - 1, ptl); } @@ -153,25 +154,21 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v pgd_t *pgd; pud_t *pud; pmd_t *pmd; - unsigned long nr; struct vm_area_struct *vma; + unsigned long end; vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); + end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); if (is_vm_hugetlb_page(vma)) { - mincore_hugetlb_page_range(vma, addr, nr, vec); - return nr; + mincore_hugetlb_page_range(vma, addr, end, vec); + return (end - addr) >> PAGE_SHIFT; } - /* - * Calculate how many pages there are left in the last level of the - * PTE array for our address. - */ - nr = min(nr, PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1))); + end = pmd_addr_end(addr, end); pgd = pgd_offset(vma->vm_mm, addr); if (pgd_none_or_clear_bad(pgd)) @@ -183,12 +180,12 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v if (pmd_none_or_clear_bad(pmd)) goto none_mapped; - mincore_pte_range(vma, pmd, addr, nr, vec); - return nr; + mincore_pte_range(vma, pmd, addr, end, vec); + return (end - addr) >> PAGE_SHIFT; none_mapped: - mincore_unmapped_range(vma, addr, nr, vec); - return nr; + mincore_unmapped_range(vma, addr, end, vec); + return (end - addr) >> PAGE_SHIFT; } /* -- cgit v1.2.3-59-g8ed1b From e48293fd75b3aa67f43ad6e3d2ff397caa55d58b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:11 -0700 Subject: mincore: do nested page table walks Do page table walks with the well-known nested loops we use in several other places already. This avoids doing full page table walks after every pte range and also allows to handle unmapped areas bigger than one pte range in one go. Signed-off-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index 211604adc23c..9ac42dc6d7b6 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -144,6 +144,60 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap_unlock(ptep - 1, ptl); } +static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pte_range(vma, pmd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pmd++, addr = next, addr != end); +} + +static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pud_t *pud; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pmd_range(vma, pud, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pud++, addr = next, addr != end); +} + +static void mincore_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pgd_t *pgd; + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pud_range(vma, pgd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pgd++, addr = next, addr != end); +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -151,9 +205,6 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; struct vm_area_struct *vma; unsigned long end; @@ -170,21 +221,11 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v end = pmd_addr_end(addr, end); - pgd = pgd_offset(vma->vm_mm, addr); - if (pgd_none_or_clear_bad(pgd)) - goto none_mapped; - pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) - goto none_mapped; - pmd = pmd_offset(pud, addr); - if (pmd_none_or_clear_bad(pmd)) - goto none_mapped; - - mincore_pte_range(vma, pmd, addr, end, vec); - return (end - addr) >> PAGE_SHIFT; + if (is_vm_hugetlb_page(vma)) + mincore_hugetlb_page_range(vma, addr, end, vec); + else + mincore_page_range(vma, addr, end, vec); -none_mapped: - mincore_unmapped_range(vma, addr, end, vec); return (end - addr) >> PAGE_SHIFT; } -- cgit v1.2.3-59-g8ed1b From e325c90ffc13b698fa2814102e05275b21c26bec Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 24 May 2010 14:32:13 -0700 Subject: mm: default to node zonelist ordering when nodes have only lowmem There are two types of zonelist ordering methodologies: - node order, preferring allocations on a node to stay local to and - zone order, preferring allocations come from a higher zone to avoid allocating in lowmem zones even though they may not be local. The ordering technique used by the kernel is configurable on the command line, but also has some logic to determine what the default should be. This logic currently lacks knowledge of systems where a node may only have lowmem. For such systems, it is necessary to use node order so that GFP_KERNEL allocations may be satisfied by nodes consisting of only lowmem. If zone order is used, GFP_KERNEL allocations to such nodes are actually allocated on a node with local affinity that includes ZONE_NORMAL. This change defaults to node zonelist ordering if any node lacks ZONE_NORMAL. To force zone order, append 'numa_zonelist_order=zone' to the kernel command line. Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f7da2a2934b7..cefe6fe8d991 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2606,7 +2606,7 @@ static int default_zonelist_order(void) * ZONE_DMA and ZONE_DMA32 can be very small area in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + * This function detect ZONE_DMA/DMA32 size and configures zone order. */ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ low_kmem_size = 0; @@ -2618,6 +2618,15 @@ static int default_zonelist_order(void) if (zone_type < ZONE_NORMAL) low_kmem_size += z->present_pages; total_size += z->present_pages; + } else if (zone_type == ZONE_NORMAL) { + /* + * If any node has only lowmem, then node order + * is preferred to allow kernel allocations + * locally; otherwise, they can easily infringe + * on other nodes when there is an abundance of + * lowmem available to allocate from. + */ + return ZONELIST_ORDER_NODE; } } } -- cgit v1.2.3-59-g8ed1b From 3f6c82728f4e31a97c3a1b32abccb512fed0b573 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:17 -0700 Subject: mm: migration: take a reference to the anon_vma before migrating This patchset is a memory compaction mechanism that reduces external fragmentation memory by moving GFP_MOVABLE pages to a fewer number of pageblocks. The term "compaction" was chosen as there are is a number of mechanisms that are not mutually exclusive that can be used to defragment memory. For example, lumpy reclaim is a form of defragmentation as was slub "defragmentation" (really a form of targeted reclaim). Hence, this is called "compaction" to distinguish it from other forms of defragmentation. In this implementation, a full compaction run involves two scanners operating within a zone - a migration and a free scanner. The migration scanner starts at the beginning of a zone and finds all movable pages within one pageblock_nr_pages-sized area and isolates them on a migratepages list. The free scanner begins at the end of the zone and searches on a per-area basis for enough free pages to migrate all the pages on the migratepages list. As each area is respectively migrated or exhausted of free pages, the scanners are advanced one area. A compaction run completes within a zone when the two scanners meet. This method is a bit primitive but is easy to understand and greater sophistication would require maintenance of counters on a per-pageblock basis. This would have a big impact on allocator fast-paths to improve compaction which is a poor trade-off. It also does not try relocate virtually contiguous pages to be physically contiguous. However, assuming transparent hugepages were in use, a hypothetical khugepaged might reuse compaction code to isolate free pages, split them and relocate userspace pages for promotion. Memory compaction can be triggered in one of three ways. It may be triggered explicitly by writing any value to /proc/sys/vm/compact_memory and compacting all of memory. It can be triggered on a per-node basis by writing any value to /sys/devices/system/node/nodeN/compact where N is the node ID to be compacted. When a process fails to allocate a high-order page, it may compact memory in an attempt to satisfy the allocation instead of entering direct reclaim. Explicit compaction does not finish until the two scanners meet and direct compaction ends if a suitable page becomes available that would meet watermarks. The series is in 14 patches. The first three are not "core" to the series but are important pre-requisites. Patch 1 reference counts anon_vma for rmap_walk_anon(). Without this patch, it's possible to use anon_vma after free if the caller is not holding a VMA or mmap_sem for the pages in question. While there should be no existing user that causes this problem, it's a requirement for memory compaction to be stable. The patch is at the start of the series for bisection reasons. Patch 2 merges the KSM and migrate counts. It could be merged with patch 1 but would be slightly harder to review. Patch 3 skips over unmapped anon pages during migration as there are no guarantees about the anon_vma existing. There is a window between when a page was isolated and migration started during which anon_vma could disappear. Patch 4 notes that PageSwapCache pages can still be migrated even if they are unmapped. Patch 5 allows CONFIG_MIGRATION to be set without CONFIG_NUMA Patch 6 exports a "unusable free space index" via debugfs. It's a measure of external fragmentation that takes the size of the allocation request into account. It can also be calculated from userspace so can be dropped if requested Patch 7 exports a "fragmentation index" which only has meaning when an allocation request fails. It determines if an allocation failure would be due to a lack of memory or external fragmentation. Patch 8 moves the definition for LRU isolation modes for use by compaction Patch 9 is the compaction mechanism although it's unreachable at this point Patch 10 adds a means of compacting all of memory with a proc trgger Patch 11 adds a means of compacting a specific node with a sysfs trigger Patch 12 adds "direct compaction" before "direct reclaim" if it is determined there is a good chance of success. Patch 13 adds a sysctl that allows tuning of the threshold at which the kernel will compact or direct reclaim Patch 14 temporarily disables compaction if an allocation failure occurs after compaction. Testing of compaction was in three stages. For the test, debugging, preempt, the sleep watchdog and lockdep were all enabled but nothing nasty popped out. min_free_kbytes was tuned as recommended by hugeadm to help fragmentation avoidance and high-order allocations. It was tested on X86, X86-64 and PPC64. Ths first test represents one of the easiest cases that can be faced for lumpy reclaim or memory compaction. 1. Machine freshly booted and configured for hugepage usage with a) hugeadm --create-global-mounts b) hugeadm --pool-pages-max DEFAULT:8G c) hugeadm --set-recommended-min_free_kbytes d) hugeadm --set-recommended-shmmax The min_free_kbytes here is important. Anti-fragmentation works best when pageblocks don't mix. hugeadm knows how to calculate a value that will significantly reduce the worst of external-fragmentation-related events as reported by the mm_page_alloc_extfrag tracepoint. 2. Load up memory a) Start updatedb b) Create in parallel a X files of pagesize*128 in size. Wait until files are created. By parallel, I mean that 4096 instances of dd were launched, one after the other using &. The crude objective being to mix filesystem metadata allocations with the buffer cache. c) Delete every second file so that pageblocks are likely to have holes d) kill updatedb if it's still running At this point, the system is quiet, memory is full but it's full with clean filesystem metadata and clean buffer cache that is unmapped. This is readily migrated or discarded so you'd expect lumpy reclaim to have no significant advantage over compaction but this is at the POC stage. 3. In increments, attempt to allocate 5% of memory as hugepages. Measure how long it took, how successful it was, how many direct reclaims took place and how how many compactions. Note the compaction figures might not fully add up as compactions can take place for orders other than the hugepage size X86 vanilla compaction Final page count 913 916 (attempted 1002) pages reclaimed 68296 9791 X86-64 vanilla compaction Final page count: 901 902 (attempted 1002) Total pages reclaimed: 112599 53234 PPC64 vanilla compaction Final page count: 93 94 (attempted 110) Total pages reclaimed: 103216 61838 There was not a dramatic improvement in success rates but it wouldn't be expected in this case either. What was important is that fewer pages were reclaimed in all cases reducing the amount of IO required to satisfy a huge page allocation. The second tests were all performance related - kernbench, netperf, iozone and sysbench. None showed anything too remarkable. The last test was a high-order allocation stress test. Many kernel compiles are started to fill memory with a pressured mix of unmovable and movable allocations. During this, an attempt is made to allocate 90% of memory as huge pages - one at a time with small delays between attempts to avoid flooding the IO queue. vanilla compaction Percentage of request allocated X86 98 99 Percentage of request allocated X86-64 95 98 Percentage of request allocated PPC64 55 70 This patch: rmap_walk_anon() does not use page_lock_anon_vma() for looking up and locking an anon_vma and it does not appear to have sufficient locking to ensure the anon_vma does not disappear from under it. This patch copies an approach used by KSM to take a reference on the anon_vma while pages are being migrated. This should prevent rmap_walk() running into nasty surprises later because anon_vma has been freed. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 23 +++++++++++++++++++++++ mm/migrate.c | 12 ++++++++++++ mm/rmap.c | 10 +++++----- 3 files changed, 40 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d25bd224d370..567d43f29a10 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -28,6 +28,9 @@ struct anon_vma { spinlock_t lock; /* Serialize access to vma list */ #ifdef CONFIG_KSM atomic_t ksm_refcount; +#endif +#ifdef CONFIG_MIGRATION + atomic_t migrate_refcount; #endif /* * NOTE: the LSB of the head.next is set by @@ -81,6 +84,26 @@ static inline int ksm_refcount(struct anon_vma *anon_vma) return 0; } #endif /* CONFIG_KSM */ +#ifdef CONFIG_MIGRATION +static inline void migrate_refcount_init(struct anon_vma *anon_vma) +{ + atomic_set(&anon_vma->migrate_refcount, 0); +} + +static inline int migrate_refcount(struct anon_vma *anon_vma) +{ + return atomic_read(&anon_vma->migrate_refcount); +} +#else +static inline void migrate_refcount_init(struct anon_vma *anon_vma) +{ +} + +static inline int migrate_refcount(struct anon_vma *anon_vma) +{ + return 0; +} +#endif /* CONFIG_MIGRATE */ static inline struct anon_vma *page_anon_vma(struct page *page) { diff --git a/mm/migrate.c b/mm/migrate.c index 5938db54e1d7..b768a1d4fa43 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -543,6 +543,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; + struct anon_vma *anon_vma = NULL; if (!newpage) return -ENOMEM; @@ -599,6 +600,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (PageAnon(page)) { rcu_read_lock(); rcu_locked = 1; + anon_vma = page_anon_vma(page); + atomic_inc(&anon_vma->migrate_refcount); } /* @@ -638,6 +641,15 @@ skip_unmap: if (rc) remove_migration_ptes(page, page); rcu_unlock: + + /* Drop an anon_vma reference if we took one */ + if (anon_vma && atomic_dec_and_lock(&anon_vma->migrate_refcount, &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } + if (rcu_locked) rcu_read_unlock(); uncharge: diff --git a/mm/rmap.c b/mm/rmap.c index 0feeef860a8f..f522cb008646 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -250,7 +250,8 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); + empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma) && + !migrate_refcount(anon_vma); spin_unlock(&anon_vma->lock); if (empty) @@ -275,6 +276,7 @@ static void anon_vma_ctor(void *data) spin_lock_init(&anon_vma->lock); ksm_refcount_init(anon_vma); + migrate_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } @@ -1355,10 +1357,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma() * because that depends on page_mapped(); but not all its usages - * are holding mmap_sem, which also gave the necessary guarantee - * (that this anon_vma's slab has not already been destroyed). - * This needs to be reviewed later: avoiding page_lock_anon_vma() - * is risky, and currently limits the usefulness of rmap_walk(). + * are holding mmap_sem. Users without mmap_sem are required to + * take a reference count to prevent the anon_vma disappearing */ anon_vma = page_anon_vma(page); if (!anon_vma) -- cgit v1.2.3-59-g8ed1b From 7f60c214fd3a360461f3286c6908084f7f8b1950 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:18 -0700 Subject: mm: migration: share the anon_vma ref counts between KSM and page migration For clarity of review, KSM and page migration have separate refcounts on the anon_vma. While clear, this is a waste of memory. This patch gets KSM and page migration to share their toys in a spirit of harmony. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 50 ++++++++++++++++++-------------------------------- mm/ksm.c | 4 ++-- mm/migrate.c | 4 ++-- mm/rmap.c | 6 ++---- 4 files changed, 24 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 567d43f29a10..77216742c178 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -26,11 +26,17 @@ */ struct anon_vma { spinlock_t lock; /* Serialize access to vma list */ -#ifdef CONFIG_KSM - atomic_t ksm_refcount; -#endif -#ifdef CONFIG_MIGRATION - atomic_t migrate_refcount; +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) + + /* + * The external_refcount is taken by either KSM or page migration + * to take a reference to an anon_vma when there is no + * guarantee that the vma of page tables will exist for + * the duration of the operation. A caller that takes + * the reference is responsible for clearing up the + * anon_vma if they are the last user on release + */ + atomic_t external_refcount; #endif /* * NOTE: the LSB of the head.next is set by @@ -64,46 +70,26 @@ struct anon_vma_chain { }; #ifdef CONFIG_MMU -#ifdef CONFIG_KSM -static inline void ksm_refcount_init(struct anon_vma *anon_vma) +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) +static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma) { - atomic_set(&anon_vma->ksm_refcount, 0); + atomic_set(&anon_vma->external_refcount, 0); } -static inline int ksm_refcount(struct anon_vma *anon_vma) +static inline int anonvma_external_refcount(struct anon_vma *anon_vma) { - return atomic_read(&anon_vma->ksm_refcount); + return atomic_read(&anon_vma->external_refcount); } #else -static inline void ksm_refcount_init(struct anon_vma *anon_vma) +static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma) { } -static inline int ksm_refcount(struct anon_vma *anon_vma) +static inline int anonvma_external_refcount(struct anon_vma *anon_vma) { return 0; } #endif /* CONFIG_KSM */ -#ifdef CONFIG_MIGRATION -static inline void migrate_refcount_init(struct anon_vma *anon_vma) -{ - atomic_set(&anon_vma->migrate_refcount, 0); -} - -static inline int migrate_refcount(struct anon_vma *anon_vma) -{ - return atomic_read(&anon_vma->migrate_refcount); -} -#else -static inline void migrate_refcount_init(struct anon_vma *anon_vma) -{ -} - -static inline int migrate_refcount(struct anon_vma *anon_vma) -{ - return 0; -} -#endif /* CONFIG_MIGRATE */ static inline struct anon_vma *page_anon_vma(struct page *page) { diff --git a/mm/ksm.c b/mm/ksm.c index 956880f2ff49..6c3e99b4ae7c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -318,14 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, struct anon_vma *anon_vma) { rmap_item->anon_vma = anon_vma; - atomic_inc(&anon_vma->ksm_refcount); + atomic_inc(&anon_vma->external_refcount); } static void drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty = list_empty(&anon_vma->head); spin_unlock(&anon_vma->lock); if (empty) diff --git a/mm/migrate.c b/mm/migrate.c index b768a1d4fa43..42a3d24d1107 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -601,7 +601,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rcu_read_lock(); rcu_locked = 1; anon_vma = page_anon_vma(page); - atomic_inc(&anon_vma->migrate_refcount); + atomic_inc(&anon_vma->external_refcount); } /* @@ -643,7 +643,7 @@ skip_unmap: rcu_unlock: /* Drop an anon_vma reference if we took one */ - if (anon_vma && atomic_dec_and_lock(&anon_vma->migrate_refcount, &anon_vma->lock)) { + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty = list_empty(&anon_vma->head); spin_unlock(&anon_vma->lock); if (empty) diff --git a/mm/rmap.c b/mm/rmap.c index f522cb008646..b5c320f7d0a5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -250,8 +250,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma) && - !migrate_refcount(anon_vma); + empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); spin_unlock(&anon_vma->lock); if (empty) @@ -275,8 +274,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); - ksm_refcount_init(anon_vma); - migrate_refcount_init(anon_vma); + anonvma_external_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } -- cgit v1.2.3-59-g8ed1b From 67b9509b2c68ae38cecb83a239881cb0ddf087dc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:19 -0700 Subject: mm: migration: do not try to migrate unmapped anonymous pages rmap_walk_anon() was triggering errors in memory compaction that look like use-after-free errors. The problem is that between the page being isolated from the LRU and rcu_read_lock() being taken, the mapcount of the page dropped to 0 and the anon_vma gets freed. This can happen during memory compaction if pages being migrated belong to a process that exits before migration completes. Hence, the use-after-free race looks like 1. Page isolated for migration 2. Process exits 3. page_mapcount(page) drops to zero so anon_vma was no longer reliable 4. unmap_and_move() takes the rcu_lock but the anon_vma is already garbage 4. call try_to_unmap, looks up tha anon_vma and "locks" it but the lock is garbage. This patch checks the mapcount after the rcu lock is taken. If the mapcount is zero, the anon_vma is assumed to be freed and no further action is taken. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 42a3d24d1107..b114635962dc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -600,6 +600,17 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (PageAnon(page)) { rcu_read_lock(); rcu_locked = 1; + + /* + * If the page has no mappings any more, just bail. An + * unmapped anon page is likely to be freed soon but worse, + * it's possible its anon_vma disappeared between when + * the page was isolated and when we reached here while + * the RCU lock was not held + */ + if (!page_mapped(page)) + goto rcu_unlock; + anon_vma = page_anon_vma(page); atomic_inc(&anon_vma->external_refcount); } -- cgit v1.2.3-59-g8ed1b From 3fe2011ff51e92500010a495df4be86745fbbda9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:20 -0700 Subject: mm: migration: allow the migration of PageSwapCache pages PageAnon pages that are unmapped may or may not have an anon_vma so are not currently migrated. However, a swap cache page can be migrated and fits this description. This patch identifies page swap caches and allows them to be migrated but ensures that no attempt to made to remap the pages would would potentially try to access an already freed anon_vma. Signed-off-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Rik van Riel Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 53 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index b114635962dc..4afd6fe3c074 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -485,7 +485,8 @@ static int fallback_migrate_page(struct address_space *mapping, * < 0 - error code * == 0 - success */ -static int move_to_new_page(struct page *newpage, struct page *page) +static int move_to_new_page(struct page *newpage, struct page *page, + int remap_swapcache) { struct address_space *mapping; int rc; @@ -520,10 +521,12 @@ static int move_to_new_page(struct page *newpage, struct page *page) else rc = fallback_migrate_page(mapping, newpage, page); - if (!rc) - remove_migration_ptes(page, newpage); - else + if (rc) { newpage->mapping = NULL; + } else { + if (remap_swapcache) + remove_migration_ptes(page, newpage); + } unlock_page(newpage); @@ -540,6 +543,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); + int remap_swapcache = 1; int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; @@ -601,18 +605,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rcu_read_lock(); rcu_locked = 1; - /* - * If the page has no mappings any more, just bail. An - * unmapped anon page is likely to be freed soon but worse, - * it's possible its anon_vma disappeared between when - * the page was isolated and when we reached here while - * the RCU lock was not held - */ - if (!page_mapped(page)) - goto rcu_unlock; + /* Determine how to safely use anon_vma */ + if (!page_mapped(page)) { + if (!PageSwapCache(page)) + goto rcu_unlock; - anon_vma = page_anon_vma(page); - atomic_inc(&anon_vma->external_refcount); + /* + * We cannot be sure that the anon_vma of an unmapped + * swapcache page is safe to use because we don't + * know in advance if the VMA that this page belonged + * to still exists. If the VMA and others sharing the + * data have been freed, then the anon_vma could + * already be invalid. + * + * To avoid this possibility, swapcache pages get + * migrated but are not remapped when migration + * completes + */ + remap_swapcache = 0; + } else { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + anon_vma = page_anon_vma(page); + atomic_inc(&anon_vma->external_refcount); + } } /* @@ -647,9 +666,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page); + rc = move_to_new_page(newpage, page, remap_swapcache); - if (rc) + if (rc && remap_swapcache) remove_migration_ptes(page, page); rcu_unlock: -- cgit v1.2.3-59-g8ed1b From e9e96b39f932a065e14f5d5bab0797ae261d03b5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:21 -0700 Subject: mm: allow CONFIG_MIGRATION to be set without CONFIG_NUMA or memory hot-remove CONFIG_MIGRATION currently depends on CONFIG_NUMA or on the architecture being able to hot-remove memory. The main users of page migration such as sys_move_pages(), sys_migrate_pages() and cpuset process migration are only beneficial on NUMA so it makes sense. As memory compaction will operate within a zone and is useful on both NUMA and non-NUMA systems, this patch allows CONFIG_MIGRATION to be set if the user selects CONFIG_COMPACTION as an option. [akpm@linux-foundation.org: Depend on CONFIG_HUGETLB_PAGE] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: Rik van Riel Reviewed-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 9c61158308dc..527136b22384 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -171,6 +171,15 @@ config SPLIT_PTLOCK_CPUS default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +# +# support for memory compaction +config COMPACTION + bool "Allow for memory compaction" + select MIGRATION + depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + help + Allows the compaction of memory for the allocation of huge pages. + # # support for page migration # @@ -180,9 +189,11 @@ config MIGRATION depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE help Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful for - example on NUMA systems to put pages nearer to the processors accessing - the page. + while the virtual addresses are not changed. This is useful in + two situations. The first is on NUMA systems to put pages nearer + to the processors accessing. The second is when allocating huge + pages as migration can relocate pages to satisfy a huge page + allocation instead of reclaiming. config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -- cgit v1.2.3-59-g8ed1b From a8bef8ff6ea15fa4c67433cab0f5f3484574ef7c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:24 -0700 Subject: mm: migration: avoid race between shift_arg_pages() and rmap_walk() during migration by not migrating temporary stacks Page migration requires rmap to be able to find all ptes mapping a page at all times, otherwise the migration entry can be instantiated, but it is possible to leave one behind if the second rmap_walk fails to find the page. If this page is later faulted, migration_entry_to_page() will call BUG because the page is locked indicating the page was migrated by the migration PTE not cleaned up. For example kernel BUG at include/linux/swapops.h:105! invalid opcode: 0000 [#1] PREEMPT SMP ... Call Trace: [] handle_mm_fault+0x3f8/0x76a [] do_page_fault+0x44a/0x46e [] page_fault+0x25/0x30 [] load_elf_binary+0x152a/0x192b [] search_binary_handler+0x173/0x313 [] do_execve+0x219/0x30a [] sys_execve+0x43/0x5e [] stub_execve+0x6a/0xc0 RIP [] migration_entry_wait+0xc1/0x129 There is a race between shift_arg_pages and migration that triggers this bug. A temporary stack is setup during exec and later moved. If migration moves a page in the temporary stack and the VMA is then removed before migration completes, the migration PTE may not be found leading to a BUG when the stack is faulted. This patch causes pages within the temporary stack during exec to be skipped by migration. It does this by marking the VMA covering the temporary stack with an otherwise impossible combination of VMA flags. These flags are cleared when the temporary stack is moved to its final location. [kamezawa.hiroyu@jp.fujitsu.com: idea for having migration skip temporary stacks] Signed-off-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Acked-by: Linus Torvalds Cc: Minchan Kim Cc: Christoph Lameter Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++++- include/linux/mm.h | 3 +++ mm/rmap.c | 30 +++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/fs/exec.c b/fs/exec.c index e6e94c626c2c..9badbc0bfb1d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); @@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); @@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index fb19bb92b809..98ea5bab963e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -106,6 +106,9 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif diff --git a/mm/rmap.c b/mm/rmap.c index b5c320f7d0a5..38a336e2eea1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } +static bool is_vma_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method @@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); + unsigned long address; + + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && + is_vma_temporary_stack(vma)) + continue; + + address = vma_address(page, vma); if (address == -EFAULT) continue; ret = try_to_unmap_one(page, vma, address, flags); -- cgit v1.2.3-59-g8ed1b From d7a5752c0c19750312efab3a2a80d350e11fa4a2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:25 -0700 Subject: mm: export unusable free space index via debugfs The unusable free space index measures how much of the available free memory cannot be used to satisfy an allocation of a given size and is a value between 0 and 1. The higher the value, the more of free memory is unusable and by implication, the worse the external fragmentation is. For the most part, the huge page size will be the size of interest but not necessarily so it is exported on a per-order and per-zone basis via /sys/kernel/debug/extfrag/unusable_index. > cat /sys/kernel/debug/extfrag/unusable_index Node 0, zone DMA 0.000 0.000 0.000 0.001 0.005 0.013 0.021 0.037 0.037 0.101 0.230 Node 0, zone Normal 0.000 0.000 0.000 0.001 0.002 0.002 0.005 0.015 0.028 0.028 0.054 [akpm@linux-foundation.org: Fix allnoconfig] Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index fa12ea3051fb..d3e0fa169f05 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -379,7 +379,50 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) } #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_COMPACTION +struct contig_page_info { + unsigned long free_pages; + unsigned long free_blocks_total; + unsigned long free_blocks_suitable; +}; + +/* + * Calculate the number of free pages in a zone, how many contiguous + * pages are free and how many are large enough to satisfy an allocation of + * the target size. Note that this function makes no attempt to estimate + * how many suitable free blocks there *might* be if MOVABLE pages were + * migrated. Calculating that is possible, but expensive and can be + * figured out from userspace + */ +static void fill_contig_page_info(struct zone *zone, + unsigned int suitable_order, + struct contig_page_info *info) +{ + unsigned int order; + + info->free_pages = 0; + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + + for (order = 0; order < MAX_ORDER; order++) { + unsigned long blocks; + + /* Count number of free blocks */ + blocks = zone->free_area[order].nr_free; + info->free_blocks_total += blocks; + + /* Count free base pages */ + info->free_pages += blocks << order; + + /* Count the suitable free blocks */ + if (order >= suitable_order) + info->free_blocks_suitable += blocks << + (order - suitable_order); + } +} +#endif + +#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) #include #include @@ -432,7 +475,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, spin_unlock_irqrestore(&zone->lock, flags); } } +#endif +#ifdef CONFIG_PROC_FS static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -954,3 +999,106 @@ static int __init setup_vmstat(void) return 0; } module_init(setup_vmstat) + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) +#include + +static struct dentry *extfrag_debug_root; + +/* + * Return an index indicating how much of the available free memory is + * unusable for an allocation of the requested size. + */ +static int unusable_free_index(unsigned int order, + struct contig_page_info *info) +{ + /* No free memory is interpreted as all free memory is unusable */ + if (info->free_pages == 0) + return 1000; + + /* + * Index should be a value between 0 and 1. Return a value to 3 + * decimal places. + * + * 0 => no fragmentation + * 1 => high fragmentation + */ + return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); + +} + +static void unusable_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display unusable free space index + * + * The unusable free space index measures how much of the available free + * memory cannot be used to satisfy an allocation of a given size and is a + * value between 0 and 1. The higher the value, the more of free memory is + * unusable and by implication, the worse the external fragmentation is. This + * can be expressed as a percentage by multiplying by 100. + */ +static int unusable_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + return 0; + + walk_zones_in_node(m, pgdat, unusable_show_print); + + return 0; +} + +static const struct seq_operations unusable_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = unusable_show, +}; + +static int unusable_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unusable_op); +} + +static const struct file_operations unusable_file_ops = { + .open = unusable_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init extfrag_debug_init(void) +{ + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); + if (!extfrag_debug_root) + return -ENOMEM; + + if (!debugfs_create_file("unusable_index", 0444, + extfrag_debug_root, NULL, &unusable_file_ops)) + return -ENOMEM; + + return 0; +} + +module_init(extfrag_debug_init); +#endif -- cgit v1.2.3-59-g8ed1b From f1a5ab1210579e2d3ac8c0c227645823af5aafb0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:26 -0700 Subject: mm: export fragmentation index via debugfs The fragmentation fragmentation index, is only meaningful if an allocation would fail and indicates what the failure is due to. A value of -1 such as in many of the examples above states that the allocation would succeed. If it would fail, the value is between 0 and 1. A value tending towards 0 implies the allocation failed due to a lack of memory. A value tending towards 1 implies it failed due to external fragmentation. For the most part, the huge page size will be the size of interest but not necessarily so it is exported on a per-order and per-zo basis via /sys/kernel/debug/extfrag/extfrag_index > cat /sys/kernel/debug/extfrag/extfrag_index Node 0, zone DMA -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.00 Node 0, zone Normal -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 0.954 Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Acked-by: Rik van Riel Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index d3e0fa169f05..23a5899c7461 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -420,6 +421,33 @@ static void fill_contig_page_info(struct zone *zone, (order - suitable_order); } } + +/* + * A fragmentation index only makes sense if an allocation of a requested + * size would fail. If that is true, the fragmentation index indicates + * whether external fragmentation or a lack of memory was the problem. + * The value can be used to determine if page reclaim or compaction + * should be used + */ +int fragmentation_index(unsigned int order, struct contig_page_info *info) +{ + unsigned long requested = 1UL << order; + + if (!info->free_blocks_total) + return 0; + + /* Fragmentation index only makes sense when a request would fail */ + if (info->free_blocks_suitable) + return -1000; + + /* + * Index is between 0 and 1 so return within 3 decimal places + * + * 0 => allocation would fail due to lack of memory + * 1 => allocation would fail due to fragmentation + */ + return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); +} #endif #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) @@ -1087,6 +1115,58 @@ static const struct file_operations unusable_file_ops = { .release = seq_release, }; +static void extfrag_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + + /* Alloc on stack as interrupts are disabled for zone walk */ + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = fragmentation_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display fragmentation index for orders that allocations would fail for + */ +static int extfrag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, extfrag_show_print); + + return 0; +} + +static const struct seq_operations extfrag_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = extfrag_show, +}; + +static int extfrag_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &extfrag_op); +} + +static const struct file_operations extfrag_file_ops = { + .open = extfrag_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static int __init extfrag_debug_init(void) { extfrag_debug_root = debugfs_create_dir("extfrag", NULL); @@ -1097,6 +1177,10 @@ static int __init extfrag_debug_init(void) extfrag_debug_root, NULL, &unusable_file_ops)) return -ENOMEM; + if (!debugfs_create_file("extfrag_index", 0444, + extfrag_debug_root, NULL, &extfrag_file_ops)) + return -ENOMEM; + return 0; } -- cgit v1.2.3-59-g8ed1b From c175a0ce7584e5b498fff8cbdb9aa7912aa9fbba Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:26 -0700 Subject: mm: move definition for LRU isolation modes to a header Currently, vmscan.c defines the isolation modes for __isolate_lru_page(). Memory compaction needs access to these modes for isolating pages for migration. This patch exports them. Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Cc: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/vmscan.c | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 600b03b7732e..1e8ce148d058 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -229,6 +229,11 @@ static inline void lru_cache_add_file(struct page *page) __lru_cache_add(page, LRU_INACTIVE_FILE); } +/* LRU Isolation modes. */ +#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ +#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ +#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ + /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); diff --git a/mm/vmscan.c b/mm/vmscan.c index f2c367c9ec12..25b0202c60df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -839,11 +839,6 @@ keep: return nr_reclaimed; } -/* LRU Isolation modes. */ -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ -#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ - /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being -- cgit v1.2.3-59-g8ed1b From 748446bb6b5a9390b546af38ec899c868a9dbcf0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:27 -0700 Subject: mm: compaction: memory compaction core This patch is the core of a mechanism which compacts memory in a zone by relocating movable pages towards the end of the zone. A single compaction run involves a migration scanner and a free scanner. Both scanners operate on pageblock-sized areas in the zone. The migration scanner starts at the bottom of the zone and searches for all movable pages within each area, isolating them onto a private list called migratelist. The free scanner starts at the top of the zone and searches for suitable areas and consumes the free pages within making them available for the migration scanner. The pages isolated for migration are then migrated to the newly isolated free pages. [aarcange@redhat.com: Fix unsafe optimisation] [mel@csn.ul.ie: do not schedule work on other CPUs for compaction] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 9 ++ include/linux/migrate.h | 2 + include/linux/mm.h | 1 + include/linux/swap.h | 1 + include/linux/vmstat.h | 3 + mm/Makefile | 1 + mm/compaction.c | 393 +++++++++++++++++++++++++++++++++++++++++++++ mm/migrate.c | 11 +- mm/page_alloc.c | 45 ++++++ mm/vmstat.c | 7 + 10 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 include/linux/compaction.h create mode 100644 mm/compaction.c (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h new file mode 100644 index 000000000000..465ca88615ed --- /dev/null +++ b/include/linux/compaction.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_COMPACTION_H +#define _LINUX_COMPACTION_H + +/* Return values for compact_zone() */ +#define COMPACT_CONTINUE 0 +#define COMPACT_PARTIAL 1 +#define COMPACT_COMPLETE 2 + +#endif /* _LINUX_COMPACTION_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7a07b17d27c5..7238231b8dd4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,7 @@ extern int fail_migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_prep(void); +extern int migrate_prep_local(void); extern int migrate_vmas(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, unsigned long flags); @@ -30,6 +31,7 @@ static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, int offlining) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } +static inline int migrate_prep_local(void) { return -ENOSYS; } static inline int migrate_vmas(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, diff --git a/include/linux/mm.h b/include/linux/mm.h index 98ea5bab963e..963f908af9d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -337,6 +337,7 @@ void put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); +int split_free_page(struct page *page); /* * Compound pages have a destructor function. Provide a diff --git a/include/linux/swap.h b/include/linux/swap.h index 1e8ce148d058..b6b614364dd8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -152,6 +152,7 @@ enum { }; #define SWAP_CLUSTER_MAX 32 +#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX #define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ #define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 117f0dd8ad03..b421d1b22b62 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -43,6 +43,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, +#ifdef CONFIG_COMPACTION + COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, +#endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif diff --git a/mm/Makefile b/mm/Makefile index 6c2a73a54a43..8982504bd03b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o diff --git a/mm/compaction.c b/mm/compaction.c new file mode 100644 index 000000000000..be1ff3f7552b --- /dev/null +++ b/mm/compaction.c @@ -0,0 +1,393 @@ +/* + * linux/mm/compaction.c + * + * Memory compaction for the reduction of external fragmentation. Note that + * this heavily depends upon page migration to do all the real heavy + * lifting + * + * Copyright IBM Corp. 2007-2010 Mel Gorman + */ +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + + /* Account for isolated anon and file pages */ + unsigned long nr_anon; + unsigned long nr_file; + + struct zone *zone; +}; + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long count = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + list_del(&page->lru); + __free_page(page); + count++; + } + + return count; +} + +/* Isolate free pages onto a private freelist. Must hold zone->lock */ +static unsigned long isolate_freepages_block(struct zone *zone, + unsigned long blockpfn, + struct list_head *freelist) +{ + unsigned long zone_end_pfn, end_pfn; + int total_isolated = 0; + struct page *cursor; + + /* Get the last PFN we should scan for free pages at */ + zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); + + /* Find the first usable PFN in the block to initialse page cursor */ + for (; blockpfn < end_pfn; blockpfn++) { + if (pfn_valid_within(blockpfn)) + break; + } + cursor = pfn_to_page(blockpfn); + + /* Isolate free pages. This assumes the block is valid */ + for (; blockpfn < end_pfn; blockpfn++, cursor++) { + int isolated, i; + struct page *page = cursor; + + if (!pfn_valid_within(blockpfn)) + continue; + + if (!PageBuddy(page)) + continue; + + /* Found a free page, break it into order-0 pages */ + isolated = split_free_page(page); + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ + if (isolated) { + blockpfn += isolated - 1; + cursor += isolated - 1; + } + } + + return total_isolated; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE, allow migration */ + if (migratetype == MIGRATE_MOVABLE) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn; + unsigned long flags; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + pfn = cc->free_pfn; + low_pfn = cc->migrate_pfn + pageblock_nr_pages; + high_pfn = low_pfn; + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + spin_lock_irqsave(&zone->lock, flags); + for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* Found a block suitable for isolating free pages from */ + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + + /* + * Record the highest PFN we isolated pages from. When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) + high_pfn = max(high_pfn, pfn); + } + spin_unlock_irqrestore(&zone->lock, flags); + + /* split_free_page does not map the pages */ + list_for_each_entry(page, freelist, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } + + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; +} + +/* Update the number of anon and file isolated pages in the zone */ +static void acct_isolated(struct zone *zone, struct compact_control *cc) +{ + struct page *page; + unsigned int count[NR_LRU_LISTS] = { 0, }; + + list_for_each_entry(page, &cc->migratepages, lru) { + int lru = page_lru_base_type(page); + count[lru]++; + } + + cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); +} + +/* Similar to reclaim, but different enough that they don't share logic */ +static bool too_many_isolated(struct zone *zone) +{ + + unsigned long inactive, isolated; + + inactive = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_FILE) + + zone_page_state(zone, NR_ISOLATED_ANON); + + return isolated > inactive; +} + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. + */ +static unsigned long isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + struct list_head *migratelist = &cc->migratepages; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return 0; + } + + /* + * Ensure that there are not too many pages isolated from the LRU + * list by either parallel reclaimers or compaction. If there are, + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(zone))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + if (fatal_signal_pending(current)) + return 0; + } + + /* Time to isolate some pages for migration */ + spin_lock_irq(&zone->lru_lock); + for (; low_pfn < end_pfn; low_pfn++) { + struct page *page; + if (!pfn_valid_within(low_pfn)) + continue; + + /* Get the page and skip if free */ + page = pfn_to_page(low_pfn); + if (PageBuddy(page)) + continue; + + /* Try isolate the page */ + if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) + continue; + + /* Successfully isolated */ + del_page_from_lru_list(zone, page, page_lru(page)); + list_add(&page->lru, migratelist); + mem_cgroup_del_lru(page); + cc->nr_migratepages++; + + /* Avoid isolating too much */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; + } + + acct_isolated(zone, cc); + + spin_unlock_irq(&zone->lru_lock); + cc->migrate_pfn = low_pfn; + + return cc->nr_migratepages; +} + +/* + * This is a migrate-callback that "allocates" freepages by taking pages + * from the isolated freelists in the block we are migrating to. + */ +static struct page *compaction_alloc(struct page *migratepage, + unsigned long data, + int **result) +{ + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; + + /* Isolate free pages if necessary */ + if (list_empty(&cc->freepages)) { + isolate_freepages(cc->zone, cc); + + if (list_empty(&cc->freepages)) + return NULL; + } + + freepage = list_entry(cc->freepages.next, struct page, lru); + list_del(&freepage->lru); + cc->nr_freepages--; + + return freepage; +} + +/* + * We cannot control nr_migratepages and nr_freepages fully when migration is + * running as migrate_pages() has no knowledge of compact_control. When + * migration is complete, we count the number of pages on the lists by hand. + */ +static void update_nr_listpages(struct compact_control *cc) +{ + int nr_migratepages = 0; + int nr_freepages = 0; + struct page *page; + + list_for_each_entry(page, &cc->migratepages, lru) + nr_migratepages++; + list_for_each_entry(page, &cc->freepages, lru) + nr_freepages++; + + cc->nr_migratepages = nr_migratepages; + cc->nr_freepages = nr_freepages; +} + +static int compact_finished(struct zone *zone, + struct compact_control *cc) +{ + if (fatal_signal_pending(current)) + return COMPACT_PARTIAL; + + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) + return COMPACT_COMPLETE; + + return COMPACT_CONTINUE; +} + +static int compact_zone(struct zone *zone, struct compact_control *cc) +{ + int ret; + + /* Setup to move all movable pages to the end of the zone */ + cc->migrate_pfn = zone->zone_start_pfn; + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); + + migrate_prep_local(); + + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + unsigned long nr_migrate, nr_remaining; + + if (!isolate_migratepages(zone, cc)) + continue; + + nr_migrate = cc->nr_migratepages; + migrate_pages(&cc->migratepages, compaction_alloc, + (unsigned long)cc, 0); + update_nr_listpages(cc); + nr_remaining = cc->nr_migratepages; + + count_vm_event(COMPACTBLOCKS); + count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); + if (nr_remaining) + count_vm_events(COMPACTPAGEFAILED, nr_remaining); + + /* Release LRU pages not migrated */ + if (!list_empty(&cc->migratepages)) { + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; + } + + } + + /* Release free pages and check accounting */ + cc->nr_freepages -= release_freepages(&cc->freepages); + VM_BUG_ON(cc->nr_freepages != 0); + + return ret; +} diff --git a/mm/migrate.c b/mm/migrate.c index 4afd6fe3c074..09e2471afa0f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -40,7 +40,8 @@ /* * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). + * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is + * undesirable, use migrate_prep_local() */ int migrate_prep(void) { @@ -55,6 +56,14 @@ int migrate_prep(void) return 0; } +/* Do the necessary work of migrate_prep but not if it involves other CPUs */ +int migrate_prep_local(void) +{ + lru_add_drain(); + + return 0; +} + /* * Add isolated pages on the list back to the LRU under page lock * to avoid leaking evictable pages back onto unevictable list. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cefe6fe8d991..c54376a09f30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1207,6 +1207,51 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } +/* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + unsigned long watermark; + struct zone *zone; + + BUG_ON(!PageBuddy(page)); + + zone = page_zone(page); + order = page_order(page); + + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + /* Remove page from free list */ + list_del(&page->lru); + zone->free_area[order].nr_free--; + rmv_page_order(page); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } + + return 1 << order; +} + /* * Really, prep_compound_page() should be called from __rmqueue_bulk(). But * we cheat by calling it from here, in the order > 0 path. Saves a branch diff --git a/mm/vmstat.c b/mm/vmstat.c index 23a5899c7461..c6aacf51b554 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -766,6 +766,13 @@ static const char * const vmstat_text[] = { "allocstall", "pgrotated", + +#ifdef CONFIG_COMPACTION + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", +#endif + #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", -- cgit v1.2.3-59-g8ed1b From 76ab0f530e4a01d4dc20cdc1d5e87753c579dc18 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:28 -0700 Subject: mm: compaction: add /proc trigger for memory compaction Add a proc file /proc/sys/vm/compact_memory. When an arbitrary value is written to the file, all zones are compacted. The expected user of such a trigger is a job scheduler that prepares the system before the target application runs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 10 ++++++++ include/linux/compaction.h | 6 +++++ kernel/sysctl.c | 10 ++++++++ mm/compaction.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6c7d18c53f84..56dd29b97a91 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -19,6 +19,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - block_dump +- compact_memory - dirty_background_bytes - dirty_background_ratio - dirty_bytes @@ -64,6 +65,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. ============================================================== +compact_memory + +Available only when CONFIG_COMPACTION is set. When 1 is written to the file, +all zones are compacted such that free memory is available in contiguous +blocks where possible. This can be important for example in the allocation of +huge pages although processes will also directly compact memory as required. + +============================================================== + dirty_background_bytes Contains the amount of dirty memory at which the pdflush background writeback diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 465ca88615ed..572388880ba8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -6,4 +6,10 @@ #define COMPACT_PARTIAL 1 #define COMPACT_COMPLETE 2 +#ifdef CONFIG_COMPACTION +extern int sysctl_compact_memory; +extern int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); +#endif /* CONFIG_COMPACTION */ + #endif /* _LINUX_COMPACTION_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4c93486b45d1..284f330d6a01 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1121,6 +1122,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, +#endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", .data = &min_free_kbytes, diff --git a/mm/compaction.c b/mm/compaction.c index be1ff3f7552b..77854fbc0f56 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internal.h" /* @@ -391,3 +392,64 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } + +/* Compact all zones within a node */ +static int compact_node(int nid) +{ + int zoneid; + pg_data_t *pgdat; + struct zone *zone; + + if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) + return -EINVAL; + pgdat = NODE_DATA(nid); + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + }; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + return 0; +} + +/* Compact all nodes in the system */ +static int compact_nodes(void) +{ + int nid; + + for_each_online_node(nid) + compact_node(nid); + + return COMPACT_COMPLETE; +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return compact_nodes(); + + return 0; +} -- cgit v1.2.3-59-g8ed1b From ed4a6d7f0676db50b5023cc01f6cda82a2f2a307 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:29 -0700 Subject: mm: compaction: add /sys trigger for per-node memory compaction Add a per-node sysfs file called compact. When the file is written to, each zone in that node is compacted. The intention that this would be used by something like a job scheduler in a batch system before a job starts so that the job can allocate the maximum number of hugepages without significant start-up cost. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-devices-node | 7 +++++++ drivers/base/node.c | 3 +++ include/linux/compaction.h | 16 ++++++++++++++++ mm/compaction.c | 23 +++++++++++++++++++++++ 4 files changed, 49 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-node (limited to 'mm') diff --git a/Documentation/ABI/testing/sysfs-devices-node b/Documentation/ABI/testing/sysfs-devices-node new file mode 100644 index 000000000000..453a210c3ceb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-node @@ -0,0 +1,7 @@ +What: /sys/devices/system/node/nodeX/compact +Date: February 2010 +Contact: Mel Gorman +Description: + When this file is written to, all memory within that node + will be compacted. When it completes, memory will be freed + into blocks which have as many contiguous pages as possible diff --git a/drivers/base/node.c b/drivers/base/node.c index 057979a19eea..2bdd8a94ec94 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -246,6 +247,8 @@ int register_node(struct node *node, int num, struct node *parent) scan_unevictable_register_node(node); hugetlb_register_node(node); + + compaction_register_node(node); } return error; } diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 572388880ba8..ba98cfe0ae15 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,4 +12,20 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif /* CONFIG_COMPACTION */ +#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +extern int compaction_register_node(struct node *node); +extern void compaction_unregister_node(struct node *node); + +#else + +static inline int compaction_register_node(struct node *node) +{ + return 0; +} + +static inline void compaction_unregister_node(struct node *node) +{ +} +#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */ + #endif /* _LINUX_COMPACTION_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 77854fbc0f56..f61f77983ff4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" /* @@ -453,3 +454,25 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +ssize_t sysfs_compact_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + compact_node(dev->id); + + return count; +} +static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); + +int compaction_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_compact); +} + +void compaction_unregister_node(struct node *node) +{ + return sysdev_remove_file(&node->sysdev, &attr_compact); +} +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ -- cgit v1.2.3-59-g8ed1b From 56de7263fcf3eb10c8dcdf8d59a9cec831795f3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:30 -0700 Subject: mm: compaction: direct compact when a high-order allocation fails Ordinarily when a high-order allocation fails, direct reclaim is entered to free pages to satisfy the allocation. With this patch, it is determined if an allocation failed due to external fragmentation instead of low memory and if so, the calling process will compact until a suitable page is freed. Compaction by moving pages in memory is considerably cheaper than paging out to disk and works where there are locked pages or no swap. If compaction fails to free a page of a suitable size, then reclaim will still occur. Direct compaction returns as soon as possible. As each block is compacted, it is checked if a suitable page has been freed and if so, it returns. [akpm@linux-foundation.org: Fix build errors] [aarcange@redhat.com: fix count_vm_event preempt in memory compaction direct reclaim] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 24 ++++++++-- include/linux/vmstat.h | 1 + mm/compaction.c | 117 +++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 63 ++++++++++++++++++++++++ mm/vmstat.c | 16 ++++++- 5 files changed, 215 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ba98cfe0ae15..eed40ec4280b 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -1,15 +1,31 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H -/* Return values for compact_zone() */ -#define COMPACT_CONTINUE 0 -#define COMPACT_PARTIAL 1 -#define COMPACT_COMPLETE 2 +/* Return values for compact_zone() and try_to_compact_pages() */ +/* compaction didn't start as it was not possible or direct reclaim was more suitable */ +#define COMPACT_SKIPPED 0 +/* compaction should continue to another pageblock */ +#define COMPACT_CONTINUE 1 +/* direct compaction partially compacted a zone and there are suitable pages */ +#define COMPACT_PARTIAL 2 +/* The full zone was compacted */ +#define COMPACT_COMPLETE 3 #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); + +extern int fragmentation_index(struct zone *zone, unsigned int order); +extern unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *mask); +#else +static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask) +{ + return COMPACT_CONTINUE; +} + #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index b421d1b22b62..7f43ccdc1d38 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -45,6 +45,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_COMPACTION COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, + COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/mm/compaction.c b/mm/compaction.c index f61f77983ff4..9583e193dc47 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,6 +35,8 @@ struct compact_control { unsigned long nr_anon; unsigned long nr_file; + unsigned int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; }; @@ -341,6 +343,9 @@ static void update_nr_listpages(struct compact_control *cc) static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; + unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -348,6 +353,24 @@ static int compact_finished(struct zone *zone, if (cc->free_pfn <= cc->migrate_pfn) return COMPACT_COMPLETE; + /* Compaction run is not finished if the watermark is not met */ + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + return COMPACT_CONTINUE; + + if (cc->order == -1) + return COMPACT_CONTINUE; + + /* Direct compactor: Is a suitable page free? */ + for (order = cc->order; order < MAX_ORDER; order++) { + /* Job done if page is free of the right migratetype */ + if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (order >= pageblock_order && zone->free_area[order].nr_free) + return COMPACT_PARTIAL; + } + return COMPACT_CONTINUE; } @@ -394,6 +417,99 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } +static unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask) +{ + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = order, + .migratetype = allocflags_to_migratetype(gfp_mask), + .zone = zone, + }; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + return compact_zone(zone, &cc); +} + +/** + * try_to_compact_pages - Direct compact to satisfy a high-order allocation + * @zonelist: The zonelist used for the current allocation + * @order: The order of the current allocation + * @gfp_mask: The GFP mask of the current allocation + * @nodemask: The allowed nodes to allocate from + * + * This is the main entry point for direct page compaction. + */ +unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + int may_enter_fs = gfp_mask & __GFP_FS; + int may_perform_io = gfp_mask & __GFP_IO; + unsigned long watermark; + struct zoneref *z; + struct zone *zone; + int rc = COMPACT_SKIPPED; + + /* + * Check whether it is worth even starting compaction. The order check is + * made because an assumption is made that the page allocator can satisfy + * the "cheaper" orders without taking special steps + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + return rc; + + count_vm_event(COMPACTSTALL); + + /* Compact each zone in the list */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + nodemask) { + int fragindex; + int status; + + /* + * Watermarks for order-0 must be met for compaction. Note + * the 2UL. This is because during migration, copies of + * pages need to be allocated and for a short time, the + * footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + continue; + + /* + * fragmentation index determines if allocation failures are + * due to low memory or external fragmentation + * + * index of -1 implies allocations might succeed depending + * on watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= 500) + continue; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { + rc = COMPACT_PARTIAL; + break; + } + + status = compact_zone_order(zone, order, gfp_mask); + rc = max(status, rc); + + if (zone_watermark_ok(zone, order, watermark, 0, 0)) + break; + } + + return rc; +} + + /* Compact all zones within a node */ static int compact_node(int nid) { @@ -412,6 +528,7 @@ static int compact_node(int nid) struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, + .order = -1, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c54376a09f30..cd88a860f088 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1758,6 +1759,59 @@ out: return page; } +#ifdef CONFIG_COMPACTION +/* Try memory compaction for high-order allocations before reclaim */ +static struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page; + + if (!order) + return NULL; + + *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + nodemask); + if (*did_some_progress != COMPACT_SKIPPED) { + + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); + + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags, preferred_zone, + migratetype); + if (page) { + count_vm_event(COMPACTSUCCESS); + return page; + } + + /* + * It's bad if compaction run occurs and fails. + * The most likely reason is that pages exist, + * but not enough to satisfy watermarks. + */ + count_vm_event(COMPACTFAIL); + + cond_resched(); + } + + return NULL; +} +#else +static inline struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + return NULL; +} +#endif /* CONFIG_COMPACTION */ + /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, @@ -1944,6 +1998,15 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; + /* Try direct compaction */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, diff --git a/mm/vmstat.c b/mm/vmstat.c index c6aacf51b554..7759941d4e77 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -429,7 +429,7 @@ static void fill_contig_page_info(struct zone *zone, * The value can be used to determine if page reclaim or compaction * should be used */ -int fragmentation_index(unsigned int order, struct contig_page_info *info) +static int __fragmentation_index(unsigned int order, struct contig_page_info *info) { unsigned long requested = 1UL << order; @@ -448,6 +448,15 @@ int fragmentation_index(unsigned int order, struct contig_page_info *info) */ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } + +/* Same as __fragmentation index but allocs contig_page_info on stack */ +int fragmentation_index(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + return __fragmentation_index(order, &info); +} #endif #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) @@ -771,6 +780,9 @@ static const char * const vmstat_text[] = { "compact_blocks_moved", "compact_pages_moved", "compact_pagemigrate_failed", + "compact_stall", + "compact_fail", + "compact_success", #endif #ifdef CONFIG_HUGETLB_PAGE @@ -1136,7 +1148,7 @@ static void extfrag_show_print(struct seq_file *m, zone->name); for (order = 0; order < MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); - index = fragmentation_index(order, &info); + index = __fragmentation_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); } -- cgit v1.2.3-59-g8ed1b From 5e7719058079a1423ccce56148b0aaa56b2df821 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:31 -0700 Subject: mm: compaction: add a tunable that decides when memory should be compacted and when it should be reclaimed The kernel applies some heuristics when deciding if memory should be compacted or reclaimed to satisfy a high-order allocation. One of these is based on the fragmentation. If the index is below 500, memory will not be compacted. This choice is arbitrary and not based on data. To help optimise the system and set a sensible default for this value, this patch adds a sysctl extfrag_threshold. The kernel will only compact memory if the fragmentation index is above the extfrag_threshold. [randy.dunlap@oracle.com: Fix build errors when proc fs is not configured] Signed-off-by: Mel Gorman Signed-off-by: Randy Dunlap Cc: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 15 +++++++++++++++ include/linux/compaction.h | 3 +++ kernel/sysctl.c | 15 +++++++++++++++ mm/compaction.c | 12 +++++++++++- 4 files changed, 44 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 56dd29b97a91..5fdbb612aeb8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/vm: - dirty_ratio - dirty_writeback_centisecs - drop_caches +- extfrag_threshold - hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode @@ -149,6 +150,20 @@ user should run `sync' first. ============================================================== +extfrag_threshold + +This parameter affects whether the kernel will compact memory or direct +reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what +the fragmentation index for each order is in each zone in the system. Values +tending towards 0 imply allocations would fail due to lack of memory, +values towards 1000 imply failures are due to fragmentation and -1 implies +that the allocation will succeed as long as watermarks are met. + +The kernel will not compact memory in a zone if the +fragmentation index is <= extfrag_threshold. The default value is 500. + +============================================================== + hugepages_treat_as_movable This parameter is only useful when kernelcore= is specified at boot time to diff --git a/include/linux/compaction.h b/include/linux/compaction.h index eed40ec4280b..3719325c6091 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,6 +15,9 @@ extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); +extern int sysctl_extfrag_threshold; +extern int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 284f330d6a01..84ff5e75c084 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -263,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */ static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + static struct ctl_table kern_table[] = { { .procname = "sched_child_runs_first", @@ -1130,6 +1135,16 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + #endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", diff --git a/mm/compaction.c b/mm/compaction.c index 9583e193dc47..94cce51b0b35 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -433,6 +433,8 @@ static unsigned long compact_zone_order(struct zone *zone, return compact_zone(zone, &cc); } +int sysctl_extfrag_threshold = 500; + /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation * @zonelist: The zonelist used for the current allocation @@ -491,7 +493,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * Only compact if a failure would be due to fragmentation. */ fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= 500) + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) continue; if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { @@ -572,6 +574,14 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) ssize_t sysfs_compact_node(struct sys_device *dev, struct sysdev_attribute *attr, -- cgit v1.2.3-59-g8ed1b From 4f92e2586b43a2402e116055d4edda704f911b5b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:32 -0700 Subject: mm: compaction: defer compaction using an exponential backoff when compaction fails The fragmentation index may indicate that a failure is due to external fragmentation but after a compaction run completes, it is still possible for an allocation to fail. There are two obvious reasons as to why o Page migration cannot move all pages so fragmentation remains o A suitable page may exist but watermarks are not met In the event of compaction followed by an allocation failure, this patch defers further compaction in the zone (1 << compact_defer_shift) times. If the next compaction attempt also fails, compact_defer_shift is increased up to a maximum of 6. If compaction succeeds, the defer counters are reset again. The zone that is deferred is the first zone in the zonelist - i.e. the preferred zone. To defer compaction in the other zones, the information would need to be stored in the zonelist or implemented similar to the zonelist_cache. This would impact the fast-paths and is not justified at this time. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 9 +++++++++ mm/page_alloc.c | 5 ++++- 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 3719325c6091..5ac51552d908 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,36 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); + +/* Do not skip compaction more than 64 times */ +#define COMPACT_MAX_DEFER_SHIFT 6 + +/* + * Compaction is deferred when compaction fails to result in a page + * allocation success. 1 << compact_defer_limit compactions are skipped up + * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT + */ +static inline void defer_compaction(struct zone *zone) +{ + zone->compact_considered = 0; + zone->compact_defer_shift++; + + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) + zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; +} + +/* Returns true if compaction should be skipped this time */ +static inline bool compaction_deferred(struct zone *zone) +{ + unsigned long defer_limit = 1UL << zone->compact_defer_shift; + + /* Avoid possible overflow */ + if (++zone->compact_considered > defer_limit) + zone->compact_considered = defer_limit; + + return zone->compact_considered < (1UL << zone->compact_defer_shift); +} + #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) @@ -29,6 +59,15 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline void defer_compaction(struct zone *zone) +{ +} + +static inline bool compaction_deferred(struct zone *zone) +{ + return 1; +} + #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cf9e458e96b0..fd55f725a09e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -321,6 +321,15 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_COMPACTION + /* + * On compaction failure, 1<compact_considered = 0; + preferred_zone->compact_defer_shift = 0; count_vm_event(COMPACTSUCCESS); return page; } @@ -1795,6 +1797,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); + defer_compaction(preferred_zone); cond_resched(); } -- cgit v1.2.3-59-g8ed1b From 6ec3a12712ac67ffa4b80d16e0767ffd2431a68d Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 24 May 2010 14:32:33 -0700 Subject: mm: consider the entire user address space during node migration Use mm->task_size instead of TASK_SIZE to ensure that the entire user address space is migrated. mm->task_size is independent of the calling task context. TASK SIZE may be dependant on the address space size of the calling process. Usage of TASK_SIZE can lead to partial address space migration if the calling process was 32 bit and the migrating process was 64 bit. Here is the test script used on 64 system with a 32 bit echo process: mount -t cgroup none /cgroup -o cpuset cd /cgroup mkdir 0 echo 1 > 0/cpuset.cpus echo 0 > 0/cpuset.mems echo 1 > 0/cpuset.memory_migrate mkdir 1 echo 1 > 1/cpuset.cpus echo 1 > 1/cpuset.mems echo 1 > 1/cpuset.memory_migrate echo $$ > 0/tasks 64_bit_process & pid=$! echo $pid > 1/tasks # This does not migrate all process pages without # this patch. If 64 bit echo is used or this patch is # applied, then the full address space of $pid is # migrated. To check memory migration, I watched: grep MemUsed /sys/devices/system/node/node*/meminfo Signed-off-by: Greg Thelen Acked-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 721b2b338032..75751012c552 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -928,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) -- cgit v1.2.3-59-g8ed1b From 76a33fc380c9a65e01eb15b3b87c05863a0d51db Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 24 May 2010 14:32:36 -0700 Subject: vmscan: prevent get_scan_ratio() rounding errors get_scan_ratio() calculates percentage and if the percentage is < 1%, it will round percentage down to 0% and cause we completely ignore scanning anon/file pages to reclaim memory even the total anon/file pages are very big. To avoid underflow, we don't use percentage, instead we directly calculate how many pages should be scaned. In this way, we should get several scanned pages for < 1% percent. This has some benefits: 1. increase our calculation precision 2. making our scan more smoothly. Without this, if percent[x] is underflow, shrink_zone() doesn't scan any pages and suddenly it scans all pages when priority is zero. With this, even priority isn't zero, shrink_zone() gets chance to scan some pages. Note, this patch doesn't really change logics, but just increase precision. For system with a lot of memory, this might slightly changes behavior. For example, in a sequential file read workload, without the patch, we don't swap any anon pages. With it, if anon memory size is bigger than 16G, we will see one anon page swapped. The 16G is calculated as PAGE_SIZE * priority(4096) * (fp/ap). fp/ap is assumed to be 1024 which is common in this workload. So the impact sounds not a big deal. Signed-off-by: Shaohua Li Cc: KOSAKI Motohiro Acked-by: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 107 +++++++++++++++++++++++++++++++----------------------------- 1 file changed, 55 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 25b0202c60df..8e1d72333e8a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1513,22 +1513,53 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= SWAP_CLUSTER_MAX) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * percent[0] specifies how much pressure to put on ram/swap backed - * memory, while percent[1] determines pressure on the file LRUs. + * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_ratio(struct zone *zone, struct scan_control *sc, - unsigned long *percent) +static void get_scan_count(struct zone *zone, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + u64 fraction[2], denominator; + enum lru_list l; + int noswap = 0; + + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + fraction[0] = 0; + fraction[1] = 1; + denominator = 1; + goto out; + } anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); @@ -1540,9 +1571,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= high_wmark_pages(zone))) { - percent[0] = 100; - percent[1] = 0; - return; + fraction[0] = 1; + fraction[1] = 0; + denominator = 1; + goto out; } } @@ -1589,29 +1621,22 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - /* Normalize to percentages */ - percent[0] = 100 * ap / (ap + fp + 1); - percent[1] = 100 - percent[0]; -} - -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) -{ - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; - else - nr = 0; + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp + 1; +out: + for_each_evictable_lru(l) { + int file = is_file_lru(l); + unsigned long scan; - return nr; + scan = zone_nr_lru_pages(zone, sc, l); + if (priority || noswap) { + scan >>= priority; + scan = div64_u64(scan * fraction[file], denominator); + } + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l]); + } } /* @@ -1622,33 +1647,11 @@ static void shrink_zone(int priority, struct zone *zone, { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int noswap = 0; - - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - percent[0] = 0; - percent[1] = 100; - } else - get_scan_ratio(zone, sc, percent); - for_each_evictable_lru(l) { - int file = is_file_lru(l); - unsigned long scan; - - scan = zone_nr_lru_pages(zone, sc, l); - if (priority || noswap) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); - } + get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { -- cgit v1.2.3-59-g8ed1b From bf8abe8b926f7546eb763fd2a088fe461dde6317 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 24 May 2010 14:32:36 -0700 Subject: readahead.c: fix comment Fix a wrong comment over page_cache_async_readahead(). Signed-off-by: Huang Shijie Acked-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index dfa9a1a03a11..77506a291a2d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @req_size: hint: total size of the read which the caller is performing in * pagecache pages * - * page_cache_async_ondemand() should be called when a page is used which + * page_cache_async_readahead() should be called when a page is used which * has the PG_readahead flag; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. -- cgit v1.2.3-59-g8ed1b From 5f53e76299ceebd68bdf9495e8ff80db77711236 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 24 May 2010 14:32:37 -0700 Subject: vmscan: page_check_references(): check low order lumpy reclaim properly If vmscan is under lumpy reclaim mode, it have to ignore referenced bit for making contenious free pages. but current page_check_references() doesn't. Fix it. Signed-off-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e1d72333e8a..cd4a5edf5be2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -77,6 +77,12 @@ struct scan_control { int order; + /* + * Intend to reclaim enough contenious memory rather than to reclaim + * enough amount memory. I.e, it's the mode for high order allocation. + */ + bool lumpy_reclaim_mode; + /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -575,7 +581,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + if (sc->lumpy_reclaim_mode) return PAGEREF_RECLAIM; /* @@ -1125,7 +1131,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int lumpy_reclaim = 0; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1135,17 +1140,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, return SWAP_CLUSTER_MAX; } - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_reclaim = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1158,7 +1152,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE; unsigned long nr_anon; unsigned long nr_file; @@ -1211,7 +1205,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - lumpy_reclaim) { + sc->lumpy_reclaim_mode) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* @@ -1639,6 +1633,21 @@ out: } } +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) +{ + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = 1; + else + sc->lumpy_reclaim_mode = 0; +} + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1653,6 +1662,8 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_count(zone, sc, nr, priority); + set_lumpy_reclaim_mode(priority, sc); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { -- cgit v1.2.3-59-g8ed1b From ec95f53aa6ed62ba68660cb19c8474ebe9025cce Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 24 May 2010 14:32:38 -0700 Subject: mm: introduce free_pages_prepare() free_hot_cold_page() and __free_pages_ok() have very similar freeing preparation. Consolidate them. [akpm@linux-foundation.org: fix busted coding style] Signed-off-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 95ad42de5a87..8f4f27841b71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -620,20 +620,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order, spin_unlock(&zone->lock); } -static void __free_pages_ok(struct page *page, unsigned int order) +static bool free_pages_prepare(struct page *page, unsigned int order) { - unsigned long flags; int i; int bad = 0; - int wasMlocked = __TestClearPageMlocked(page); trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0 ; i < (1 << order) ; ++i) - bad += free_pages_check(page + i); + for (i = 0; i < (1 << order); i++) { + struct page *pg = page + i; + + if (PageAnon(pg)) + pg->mapping = NULL; + bad += free_pages_check(pg); + } if (bad) - return; + return false; if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page),PAGE_SIZE<mapping = NULL; - if (free_pages_check(page)) + if (!free_pages_prepare(page, 0)) return; - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), PAGE_SIZE); - debug_check_no_obj_freed(page_address(page), PAGE_SIZE); - } - arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); local_irq_save(flags); -- cgit v1.2.3-59-g8ed1b From 142762bd8d8c46345e79f0f68d3374564306972f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:39 -0700 Subject: mm: document follow_page() Signed-off-by: Johannes Weiner Cc: Dan Carpenter Cc: Rik van Riel Cc: Izik Eidus Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 833952d8b74d..119b7ccdf39b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1227,8 +1227,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/* - * Do a quick page-table lookup for a single page. +/** + * follow_page - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) -- cgit v1.2.3-59-g8ed1b From 0aeb2339e54e40d0788a7017ecaeac7f5271e262 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:40 -0700 Subject: vmscan: remove all_unreclaimable scan control This scan control is abused to communicate a return value from shrink_zones(). Write this idiomatically and remove the knob. Signed-off-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index cd4a5edf5be2..c55763ee8312 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -73,8 +73,6 @@ struct scan_control { int swappiness; - int all_unreclaimable; - int order; /* @@ -1716,14 +1714,14 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void shrink_zones(int priority, struct zonelist *zonelist, +static int shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; + int progress = 0; - sc->all_unreclaimable = 1; for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, sc->nodemask) { if (!populated_zone(zone)) @@ -1739,19 +1737,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist, if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - sc->all_unreclaimable = 0; } else { /* * Ignore cpuset limitation here. We just want to reduce * # of used pages by us regardless of memory shortage. */ - sc->all_unreclaimable = 0; mem_cgroup_note_reclaim_priority(sc->mem_cgroup, priority); } shrink_zone(priority, zone, sc); + progress = 1; } + return progress; } /* @@ -1805,7 +1803,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - shrink_zones(priority, zonelist, sc); + ret = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1842,7 +1840,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ - if (!sc->all_unreclaimable && scanning_global_lru(sc)) + if (ret && scanning_global_lru(sc)) ret = sc->nr_reclaimed; out: /* -- cgit v1.2.3-59-g8ed1b From 8b25c6d2231b978ccce9c401e771932bde79aa9f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:40 -0700 Subject: vmscan: remove isolate_pages callback scan control For now, we have global isolation vs. memory control group isolation, do not allow the reclaim entry function to set an arbitrary page isolation callback, we do not need that flexibility. And since we already pass around the group descriptor for the memory control group isolation case, just use it to decide which one of the two isolator functions to use. The decisions can be merged into nearby branches, so no extra cost there. In fact, we save the indirect calls. Signed-off-by: Johannes Weiner Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 ++++++------ mm/vmscan.c | 52 +++++++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44301c6affa8..05894795fdc1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,13 @@ struct page_cgroup; struct page; struct mm_struct; +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active, int file); + #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -64,12 +71,6 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); extern int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - int mode, struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); diff --git a/mm/vmscan.c b/mm/vmscan.c index c55763ee8312..915dceb487c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -89,12 +89,6 @@ struct scan_control { * are scanned. */ nodemask_t *nodemask; - - /* Pluggable isolate pages callback */ - unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, - unsigned long *scanned, int order, int mode, - struct zone *z, struct mem_cgroup *mem_cont, - int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -1010,7 +1004,6 @@ static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, - struct mem_cgroup *mem_cont, int active, int file) { int lru = LRU_BASE; @@ -1154,11 +1147,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_anon; unsigned long nr_file; - nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, sc->order, mode, - zone, sc->mem_cgroup, 0, file); - if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, + &page_list, &nr_scan, + sc->order, mode, + zone, 0, file); zone->pages_scanned += nr_scan; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1166,6 +1159,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + } else { + nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX, + &page_list, &nr_scan, + sc->order, mode, + zone, sc->mem_cgroup, + 0, file); + /* + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. + */ } if (nr_taken == 0) @@ -1343,16 +1346,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, lru_add_drain(); spin_lock_irq(&zone->lru_lock); - nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1, file); - /* - * zone->pages_scanned is used for detect zone's oom - * mem_cgroup remembers nr_scan by itself. - */ if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + 1, file); zone->pages_scanned += pgscanned; + } else { + nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_cgroup, 1, file); + /* + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. + */ } + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); @@ -1882,7 +1892,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, .nodemask = nodemask, }; @@ -1903,7 +1912,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, - .isolate_pages = mem_cgroup_isolate_pages, }; nodemask_t nm = nodemask_of_node(nid); @@ -1937,7 +1945,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, - .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; @@ -2015,7 +2022,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, }; /* * temp_priority is used to remember the scanning priority at which @@ -2394,7 +2400,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .hibernation_mode = 1, .swappiness = vm_swappiness, .order = 0, - .isolate_pages = isolate_pages_global, }; struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -2579,7 +2584,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, - .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; -- cgit v1.2.3-59-g8ed1b From cf23422b9d76215316855253da491d4c9f294372 Mon Sep 17 00:00:00 2001 From: minskey guo Date: Mon, 24 May 2010 14:32:41 -0700 Subject: cpu/mem hotplug: enable CPUs online before local memory online Enable users to online CPUs even if the CPUs belongs to a numa node which doesn't have onlined local memory. The zonlists(pg_data_t.node_zonelists[]) of a numa node are created either in system boot/init period, or at the time of local memory online. For a numa node without onlined local memory, its zonelists are not initialized at present. As a result, any memory allocation operations executed by CPUs within this node will fail. In fact, an out-of-memory error is triggered when attempt to online CPUs before memory comes to online. This patch tries to create zonelists for such numa nodes, so that the memory allocation for this node can be fallback'ed to other nodes. [akpm@linux-foundation.org: remove unneeded export] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: minskey guo Cc: Minchan Kim Cc: Yasunori Goto Cc: Andi Kleen Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + kernel/cpu.c | 25 +++++++++++++++++++++++++ mm/memory_hotplug.c | 23 +++++++++++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 35b07b773e6c..864035fb8f8a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -202,6 +202,7 @@ static inline int is_mem_section_removable(unsigned long pfn, } #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern int mem_online_node(int nid); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int remove_memory(u64 start, u64 size); diff --git a/kernel/cpu.c b/kernel/cpu.c index 545777574779..a3fbcc0a0abc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -326,6 +326,12 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; + +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; + pg_data_t *pgdat; +#endif + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -336,6 +342,25 @@ int __cpuinit cpu_up(unsigned int cpu) return -EINVAL; } +#ifdef CONFIG_MEMORY_HOTPLUG + nid = cpu_to_node(cpu); + if (!node_online(nid)) { + err = mem_online_node(nid); + if (err) + return err; + } + + pgdat = NODE_DATA(nid); + if (!pgdat) { + printk(KERN_ERR + "Can't online cpu %d due to NULL pgdat\n", cpu); + return -ENOMEM; + } + + if (pgdat->node_zonelists->_zonerefs->zone == NULL) + build_all_zonelists(); +#endif + cpu_maps_update_begin(); if (cpu_hotplug_disabled) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index be211a582930..85eb4d342ac5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -482,6 +482,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } +/* + * called by cpu_up() to online a node without onlined memory. + */ +int mem_online_node(int nid) +{ + pg_data_t *pgdat; + int ret; + + lock_system_sleep(); + pgdat = hotadd_new_pgdat(nid, 0); + if (pgdat) { + ret = -ENOMEM; + goto out; + } + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + +out: + unlock_system_sleep(); + return ret; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { -- cgit v1.2.3-59-g8ed1b From ff3d58c22b6827039983911d3460cf0c1657f8cc Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 24 May 2010 14:32:46 -0700 Subject: highmem: remove unneeded #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT for debug_kmap_atomic() In f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 ("mm: introduce debug_kmap_atomic") I said that debug_kmap_atomic() needs CONFIG_TRACE_IRQFLAGS_SUPPORT. It was wrong. (I thought irqs_disabled() is only available when the architecture has CONFIG_TRACE_IRQFLAGS_SUPPORT) Remove the #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT check to enable kmap_atomic() debugging for the architectures which do not have CONFIG_TRACE_IRQFLAGS_SUPPORT. Reported-by: Andrew Morton Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 2 +- mm/highmem.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 74152c08ad07..caafd0561aa1 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -27,7 +27,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) +#ifdef CONFIG_DEBUG_HIGHMEM void debug_kmap_atomic(enum km_type type); diff --git a/mm/highmem.c b/mm/highmem.c index bed8a8bfd01f..66baa20f78f5 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,7 +422,7 @@ void __init page_address_init(void) #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) +#ifdef CONFIG_DEBUG_HIGHMEM void debug_kmap_atomic(enum km_type type) { -- cgit v1.2.3-59-g8ed1b From 319774e25fa4b7641bdc3b0a464dd84e62103347 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 24 May 2010 14:32:49 -0700 Subject: mem-hotplug: separate setup_per_cpu_pageset() into separate functions No behavior change here. Move some of setup_per_cpu_pageset() code into a new function setup_zone_pageset() that will be useful for memory hotplug. Signed-off-by: Wu Fengguang Signed-off-by: Haicheng Li Reviewed-by: Andi Kleen Reviewed-by: Christoph Lameter Cc: Mel Gorman Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f4f27841b71..595d0ac211e2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3292,31 +3292,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } +static __meminit void setup_zone_pageset(struct zone *zone) +{ + int cpu; + + zone->pageset = alloc_percpu(struct per_cpu_pageset); + + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + setup_pageset(pcp, zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); + } +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. - * Boot pagesets will no longer be used by this processorr - * after setup_per_cpu_pageset(). */ void __init setup_per_cpu_pageset(void) { struct zone *zone; - int cpu; - for_each_populated_zone(zone) { - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->present_pages / - percpu_pagelist_fraction)); - } - } + for_each_populated_zone(zone) + setup_zone_pageset(zone); } static noinline __init_refok -- cgit v1.2.3-59-g8ed1b From 1f522509c77a5dea8dc384b735314f03908a6415 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 24 May 2010 14:32:51 -0700 Subject: mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset For each new populated zone of hotadded node, need to update its pagesets with dynamically allocated per_cpu_pageset struct for all possible CPUs: 1) Detach zone->pageset from the shared boot_pageset at end of __build_all_zonelists(). 2) Use mutex to protect zone->pageset when it's still shared in onlined_pages() Otherwises, multiple zones of different nodes would share same boot strapping boot_pageset for same CPU, which will finally cause below kernel panic: ------------[ cut here ]------------ kernel BUG at mm/page_alloc.c:1239! invalid opcode: 0000 [#1] SMP ... Call Trace: [] __alloc_pages_nodemask+0x131/0x7b0 [] alloc_pages_current+0x87/0xd0 [] __page_cache_alloc+0x67/0x70 [] __do_page_cache_readahead+0x120/0x260 [] ra_submit+0x21/0x30 [] ondemand_readahead+0x166/0x2c0 [] page_cache_async_readahead+0x80/0xa0 [] generic_file_aio_read+0x364/0x670 [] nfs_file_read+0xca/0x130 [] do_sync_read+0xfa/0x140 [] vfs_read+0xb5/0x1a0 [] sys_read+0x51/0x80 [] system_call_fastpath+0x16/0x1b RIP [] get_page_from_freelist+0x883/0x900 RSP ---[ end trace 4bda28328b9990db ] [akpm@linux-foundation.org: merge fix] Signed-off-by: Haicheng Li Signed-off-by: Wu Fengguang Reviewed-by: Andi Kleen Reviewed-by: Christoph Lameter Cc: Mel Gorman Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- init/main.c | 2 +- kernel/cpu.c | 2 +- mm/memory_hotplug.c | 18 +++++++++++++----- mm/page_alloc.c | 17 +++++++++++++---- 5 files changed, 29 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f6f2c505fa7e..a367ed5bb3fe 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -652,7 +652,7 @@ typedef struct pglist_data { void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); -void build_all_zonelists(void); +void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); diff --git a/init/main.c b/init/main.c index 22881b5e95e3..3bdb152f412f 100644 --- a/init/main.c +++ b/init/main.c @@ -567,7 +567,7 @@ asmlinkage void __init start_kernel(void) setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(); + build_all_zonelists(NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); diff --git a/kernel/cpu.c b/kernel/cpu.c index a3fbcc0a0abc..3e8b3ba27175 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -358,7 +358,7 @@ int __cpuinit cpu_up(unsigned int cpu) } if (pgdat->node_zonelists->_zonerefs->zone == NULL) - build_all_zonelists(); + build_all_zonelists(NULL); #endif cpu_maps_update_begin(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 85eb4d342ac5..089cc97aed3c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -389,6 +389,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int nid; int ret; struct memory_notify arg; + /* + * mutex to protect zone->pageset when it's still shared + * in onlined_pages() + */ + static DEFINE_MUTEX(zone_pageset_mutex); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -415,12 +420,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ + mutex_lock(&zone_pageset_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + mutex_unlock(&zone_pageset_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -429,8 +436,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + if (need_zonelists_rebuild) + build_all_zonelists(zone); + else + zone_pcp_update(zone); - zone_pcp_update(zone); + mutex_unlock(&zone_pageset_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -438,10 +449,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } - if (need_zonelists_rebuild) - build_all_zonelists(); - else - vm_total_pages = nr_free_pagecache_pages(); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 595d0ac211e2..21c52d2d8624 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) - build_all_zonelists(); + build_all_zonelists(NULL); } out: mutex_unlock(&zl_order_mutex); @@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static void setup_zone_pageset(struct zone *zone); /* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *dummy) +static __init_refok int __build_all_zonelists(void *data) { int nid; int cpu; @@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy) build_zonelist_cache(pgdat); } +#ifdef CONFIG_MEMORY_HOTPLUG + /* Setup real pagesets for the new zone */ + if (data) { + struct zone *zone = data; + setup_zone_pageset(zone); + } +#endif + /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy) return 0; } -void build_all_zonelists(void) +void build_all_zonelists(void *data) { set_zonelist_order(); @@ -2969,7 +2978,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, data, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.3-59-g8ed1b From 4eaf3f64397c3db3c5785eee508270d62a9fabd9 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 24 May 2010 14:32:52 -0700 Subject: mem-hotplug: fix potential race while building zonelist for new populated zone Add global mutex zonelists_mutex to fix the possible race: CPU0 CPU1 CPU2 (1) zone->present_pages += online_pages; (2) build_all_zonelists(); (3) alloc_page(); (4) free_page(); (5) build_all_zonelists(); (6) __build_all_zonelists(); (7) zone->pageset = alloc_percpu(); In step (3,4), zone->pageset still points to boot_pageset, so bad things may happen if 2+ nodes are in this state. Even if only 1 node is accessing the boot_pageset, (3) may still consume too much memory to fail the memory allocations in step (7). Besides, atomic operation ensures alloc_percpu() in step (7) will never fail since there is a new fresh memory block added in step(6). [haicheng.li@linux.intel.com: hold zonelists_mutex when build_all_zonelists] Signed-off-by: Haicheng Li Signed-off-by: Wu Fengguang Reviewed-by: Andi Kleen Cc: Christoph Lameter Cc: Mel Gorman Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + kernel/cpu.c | 5 ++++- mm/memory_hotplug.c | 11 +++-------- mm/page_alloc.c | 15 ++++++++++++++- 4 files changed, 22 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a367ed5bb3fe..0fa491326c4a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -650,6 +650,7 @@ typedef struct pglist_data { #include +extern struct mutex zonelists_mutex; void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); void build_all_zonelists(void *data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 3e8b3ba27175..124ad9d6be16 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -357,8 +357,11 @@ int __cpuinit cpu_up(unsigned int cpu) return -ENOMEM; } - if (pgdat->node_zonelists->_zonerefs->zone == NULL) + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } #endif cpu_maps_update_begin(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 089cc97aed3c..a4cfcdc00455 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -389,11 +389,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int nid; int ret; struct memory_notify arg; - /* - * mutex to protect zone->pageset when it's still shared - * in onlined_pages() - */ - static DEFINE_MUTEX(zone_pageset_mutex); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -420,14 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ - mutex_lock(&zone_pageset_mutex); + mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { - mutex_unlock(&zone_pageset_mutex); + mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -441,7 +436,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) else zone_pcp_update(zone); - mutex_unlock(&zone_pageset_mutex); + mutex_unlock(&zonelists_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21c52d2d8624..08b349931ebc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2571,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, strncpy((char*)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) + } else if (oldval != user_zonelist_order) { + mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } } out: mutex_unlock(&zl_order_mutex); @@ -2924,6 +2927,12 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static void setup_zone_pageset(struct zone *zone); +/* + * Global mutex to protect against size modification of zonelists + * as well as to serialize pageset setup for the new populated zone. + */ +DEFINE_MUTEX(zonelists_mutex); + /* return values int ....just for stop_machine() */ static __init_refok int __build_all_zonelists(void *data) { @@ -2967,6 +2976,10 @@ static __init_refok int __build_all_zonelists(void *data) return 0; } +/* + * Called with zonelists_mutex held always + * unless system_state == SYSTEM_BOOTING. + */ void build_all_zonelists(void *data) { set_zonelist_order(); -- cgit v1.2.3-59-g8ed1b From 0cae3457b1a6e88f31020272bcfd90c178716053 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 25 May 2010 23:42:58 -0700 Subject: mempolicy: ERR_PTR dereference in mpol_shared_policy_init() The original code called mpol_put(new) while "new" was an ERR_PTR. Signed-off-by: Dan Carpenter Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 75751012c552..5d6fb339de03 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2098,7 +2098,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) - goto put_free; /* no valid nodemask intersection */ + goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); @@ -2114,6 +2114,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) put_free: mpol_put(new); /* drop initial ref */ +free_scratch: NODEMASK_SCRATCH_FREE(scratch); } } -- cgit v1.2.3-59-g8ed1b From 3c7b204547bc3d342a4e31196fe14803581d279f Mon Sep 17 00:00:00 2001 From: Bernd Schmidt Date: Tue, 25 May 2010 23:43:00 -0700 Subject: nommu: allow private mappings of read-only devices Slightly rearrange the logic that determines capabilities and vm_flags. Disable BDI_CAP_MAP_DIRECT in all cases if the device can't support the protections. Allow private readonly mappings of readonly backing devices. Signed-off-by: Bernd Schmidt Signed-off-by: Mike Frysinger Acked-by: David McCullough Acked-by: Greg Ungerer Acked-by: Paul Mundt Acked-by: David Howells Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 63fa17d121f0..b76f3ee0abe0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -918,14 +918,6 @@ static int validate_mmap_request(struct file *file, if (!(capabilities & BDI_CAP_MAP_DIRECT)) return -ENODEV; - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) - ) { - printk("MAP_SHARED not completely supported on !MMU\n"); - return -EINVAL; - } - /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; } @@ -941,6 +933,20 @@ static int validate_mmap_request(struct file *file, capabilities &= ~BDI_CAP_MAP_DIRECT; } + if (capabilities & BDI_CAP_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || + ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || + ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + ) { + capabilities &= ~BDI_CAP_MAP_DIRECT; + if (flags & MAP_SHARED) { + printk(KERN_WARNING + "MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + } + } + /* handle executable mappings and implied executable * mappings */ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { @@ -996,22 +1002,20 @@ static unsigned long determine_vm_flags(struct file *file, unsigned long vm_flags; vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); - vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* vm_flags |= mm->def_flags; */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) vm_flags |= VM_MAYSHARE; - } - else { + } else { /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ + vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); if (flags & MAP_SHARED) - vm_flags |= VM_MAYSHARE | VM_SHARED; - else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) - vm_flags |= VM_MAYSHARE; + vm_flags |= VM_SHARED; } /* refuse to let anyone share private mappings with this process if -- cgit v1.2.3-59-g8ed1b From 91803b499cca2fe558abad709ce83dc896b80950 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 26 May 2010 11:49:40 -0400 Subject: do_generic_file_read: clear page errors when issuing a fresh read of the page I/O errors can happen due to temporary failures, like multipath errors or losing network contact with the iSCSI server. Because of that, the VM will retry readpage on the page. However, do_generic_file_read does not clear PG_error. This causes the system to be unable to actually use the data in the page cache page, even if the subsequent readpage completes successfully! The function filemap_fault has had a ClearPageError before readpage forever. This patch simply adds the same to do_generic_file_read. Signed-off-by: Jeff Moyer Signed-off-by: Rik van Riel Acked-by: Larry Woodman Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 88d719665a28..35e12d186566 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1105,6 +1105,12 @@ page_not_up_to_date_locked: } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); -- cgit v1.2.3-59-g8ed1b From dc98df5a1b7be402a0e1c71f1b89ccf249ac15ee Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 26 May 2010 14:42:36 -0700 Subject: memcg: oom wakeup filter memcg's oom waitqueue is a system-wide wait_queue (for handling hierarchy.) So, it's better to add custom wake function and do filtering in wake up path. This patch adds a filtering feature for waking up oom-waiters. Hierarchy is properly handled. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c8569bc298ff..94ac208b1490 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1293,14 +1293,56 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) static DEFINE_MUTEX(memcg_oom_mutex); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); +struct oom_wait_info { + struct mem_cgroup *mem; + wait_queue_t wait; +}; + +static int memcg_oom_wake_function(wait_queue_t *wait, + unsigned mode, int sync, void *arg) +{ + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct oom_wait_info *oom_wait_info; + + oom_wait_info = container_of(wait, struct oom_wait_info, wait); + + if (oom_wait_info->mem == wake_mem) + goto wakeup; + /* if no hierarchy, no match */ + if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) + return 0; + /* + * Both of oom_wait_info->mem and wake_mem are stable under us. + * Then we can use css_is_ancestor without taking care of RCU. + */ + if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && + !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + return 0; + +wakeup: + return autoremove_wake_function(wait, mode, sync, arg); +} + +static void memcg_wakeup_oom(struct mem_cgroup *mem) +{ + /* for filtering, pass "mem" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); +} + /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) { - DEFINE_WAIT(wait); + struct oom_wait_info owait; bool locked; + owait.mem = mem; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); + /* At first, try to OOM lock hierarchy under mem.*/ mutex_lock(&memcg_oom_mutex); locked = mem_cgroup_oom_lock(mem); @@ -1310,31 +1352,18 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * under OOM is always welcomed, use TASK_KILLABLE here. */ if (!locked) - prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); mutex_unlock(&memcg_oom_mutex); if (locked) mem_cgroup_out_of_memory(mem, mask); else { schedule(); - finish_wait(&memcg_oom_waitq, &wait); + finish_wait(&memcg_oom_waitq, &owait.wait); } mutex_lock(&memcg_oom_mutex); mem_cgroup_oom_unlock(mem); - /* - * Here, we use global waitq .....more fine grained waitq ? - * Assume following hierarchy. - * A/ - * 01 - * 02 - * assume OOM happens both in A and 01 at the same time. Tthey are - * mutually exclusive by lock. (kill in 01 helps A.) - * When we use per memcg waitq, we have to wake up waiters on A and 02 - * in addtion to waiters on 01. We use global waitq for avoiding mess. - * It will not be a big problem. - * (And a task may be moved to other groups while it's waiting for OOM.) - */ - wake_up_all(&memcg_oom_waitq); + memcg_wakeup_oom(mem); mutex_unlock(&memcg_oom_mutex); if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) -- cgit v1.2.3-59-g8ed1b From 9490ff275606da012d5b373342a49610ad61cb81 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 26 May 2010 14:42:36 -0700 Subject: memcg: oom notifier Considering containers or other resource management softwares in userland, event notification of OOM in memcg should be implemented. Now, memcg has "threshold" notifier which uses eventfd, we can make use of it for oom notification. This patch adds oom notification eventfd callback for memcg. The usage is very similar to threshold notifier, but control file is memory.oom_control and no arguments other than eventfd is required. % cgroup_event_notifier /cgroup/A/memory.oom_control dummy (About cgroup_event_notifier, see Documentation/cgroup/) Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 20 +++++++- mm/memcontrol.c | 100 +++++++++++++++++++++++++++++++++++---- 2 files changed, 111 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6cab1f29da4c..eac22d3b2f7b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -184,6 +184,9 @@ limits on the root cgroup. Note2: When panic_on_oom is set to "2", the whole system will panic. +When oom event notifier is registered, event will be delivered. +(See oom_control section) + 2. Locking The memory controller uses the following hierarchy @@ -488,7 +491,22 @@ threshold in any direction. It's applicable for root and non-root cgroup. -10. TODO +10. OOM Control + +Memory controler implements oom notifier using cgroup notification +API (See cgroups.txt). It allows to register multiple oom notification +delivery and gets notification when oom happens. + +To register a notifier, application need: + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like " " to cgroup.event_control + +Application will be notifier through eventfd when oom happens. +OOM notification doesn't work for root cgroup. + + +11. TODO 1. Add support for accounting huge pages (as a separate controller) 2. Make per-cgroup scanner reclaim not-shared pages first diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94ac208b1490..da2ed3913316 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -149,6 +149,7 @@ struct mem_cgroup_threshold { u64 threshold; }; +/* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below usage. */ atomic_t current_threshold; @@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary { /* Array of thresholds */ struct mem_cgroup_threshold entries[0]; }; +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; static void mem_cgroup_threshold(struct mem_cgroup *mem); +static void mem_cgroup_oom_notify(struct mem_cgroup *mem); /* * The memory controller data structure. The memory controller controls both @@ -220,6 +227,9 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_threshold_ary *memsw_thresholds; + /* For oom notifier event fd */ + struct list_head oom_notify; + /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -282,9 +292,12 @@ enum charge_type { /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) +#define _OOM_TYPE (2) #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* Used for OOM nofiier */ +#define OOM_CONTROL (0) /* * Reclaim flags for mem_cgroup_hierarchical_reclaim @@ -1353,6 +1366,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) */ if (!locked) prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + else + mem_cgroup_oom_notify(mem); mutex_unlock(&memcg_oom_mutex); if (locked) @@ -3398,8 +3413,22 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_eventfd_list *ev; + + list_for_each_entry(ev, &mem->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); +} + +static int mem_cgroup_usage_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; @@ -3483,8 +3512,8 @@ unlock: return ret; } -static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; @@ -3568,13 +3597,61 @@ unlock: return ret; } +static int mem_cgroup_oom_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *event; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + mutex_lock(&memcg_oom_mutex); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? */ + if (atomic_read(&memcg->oom_lock)) + eventfd_signal(eventfd, 1); + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + +static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *ev, *tmp; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + + mutex_lock(&memcg_oom_mutex); + + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_register_event, - .unregister_event = mem_cgroup_unregister_event, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -3623,6 +3700,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_move_charge_read, .write_u64 = mem_cgroup_move_charge_write, }, + { + .name = "oom_control", + .register_event = mem_cgroup_oom_register_event, + .unregister_event = mem_cgroup_oom_unregister_event, + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -3631,8 +3714,8 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_register_event, - .unregister_event = mem_cgroup_unregister_event, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -3878,6 +3961,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); + INIT_LIST_HEAD(&mem->oom_notify); if (parent) mem->swappiness = get_swappiness(parent); -- cgit v1.2.3-59-g8ed1b From 3c11ecf448eff8f12922c498b8274ce98587eb74 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 26 May 2010 14:42:37 -0700 Subject: memcg: oom kill disable and oom status This adds a feature to disable oom-killer for memcg, if disabled, of course, tasks under memcg will stop. But now, we have oom-notifier for memcg. And the world around memcg is not under out-of-memory. memcg's out-of-memory just shows memcg hits limit. Then, administrator or management daemon can recover the situation by - kill some process - enlarge limit, add more swap. - migrate some tasks - remove file cache on tmps (difficult ?) Unlike oom-killer, you can take enough information before killing tasks. (by gcore, or, ps etc.) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 23 ++++++++ mm/memcontrol.c | 113 ++++++++++++++++++++++++++++++++------- 2 files changed, 117 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index eac22d3b2f7b..44e7ded33448 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -493,6 +493,8 @@ It's applicable for root and non-root cgroup. 10. OOM Control +memory.oom_control file is for OOM notification and other controls. + Memory controler implements oom notifier using cgroup notification API (See cgroups.txt). It allows to register multiple oom notification delivery and gets notification when oom happens. @@ -505,6 +507,27 @@ To register a notifier, application need: Application will be notifier through eventfd when oom happens. OOM notification doesn't work for root cgroup. +You can disable oom-killer by writing "1" to memory.oom_control file. +As. + #echo 1 > memory.oom_control + +This operation is only allowed to the top cgroup of subhierarchy. +If oom-killer is disabled, tasks under cgroup will hang/sleep +in memcg's oom-waitq when they request accountable memory. + +For running them, you have to relax the memcg's oom sitaution by + * enlarge limit or reduce usage. +To reduce usage, + * kill some tasks. + * move some tasks to other group with account migration. + * remove some files (on tmpfs?) + +Then, stopped tasks will work again. + +At reading, current status of OOM is shown. + oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) + under_oom 0 or 1 (if 1, the memcg is under OOM,tasks may + be stopped.) 11. TODO diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da2ed3913316..53eb30ebdb49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -214,6 +214,8 @@ struct mem_cgroup { atomic_t refcnt; unsigned int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; /* set when res.limit == memsw.limit */ bool memsw_is_minimum; @@ -235,7 +237,6 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; - /* * percpu counter. */ @@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); } +static void memcg_oom_recover(struct mem_cgroup *mem) +{ + if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) + memcg_wakeup_oom(mem); +} + /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) { struct oom_wait_info owait; - bool locked; + bool locked, need_to_kill; owait.mem = mem; owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); - + need_to_kill = true; /* At first, try to OOM lock hierarchy under mem.*/ mutex_lock(&memcg_oom_mutex); locked = mem_cgroup_oom_lock(mem); @@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL * under OOM is always welcomed, use TASK_KILLABLE here. */ - if (!locked) - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - else + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + if (!locked || mem->oom_kill_disable) + need_to_kill = false; + if (locked) mem_cgroup_oom_notify(mem); mutex_unlock(&memcg_oom_mutex); - if (locked) + if (need_to_kill) { + finish_wait(&memcg_oom_waitq, &owait.wait); mem_cgroup_out_of_memory(mem, mask); - else { + } else { schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } @@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; batch = ¤t->memcg_batch; /* @@ -2180,6 +2180,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) */ if (!batch->memcg) batch->memcg = mem; + /* + * do_batch > 0 when unmapping pages or inode invalidate/truncate. + * In those cases, all pages freed continously can be expected to be in + * the same cgroup and we have chance to coalesce uncharges. + * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) + * because we want to do uncharge as soon as possible. + */ + + if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. @@ -2196,6 +2207,8 @@ direct_uncharge: res_counter_uncharge(&mem->res, PAGE_SIZE); if (uncharge_memsw) res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (unlikely(batch->memcg != mem)) + memcg_oom_recover(mem); return; } @@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void) res_counter_uncharge(&batch->memcg->res, batch->bytes); if (batch->memsw_bytes) res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; } @@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memswlimit; + u64 memswlimit, memlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); u64 curusage, oldusage; + int enlarge; /* * For keeping hierarchical_reclaim simple, how long we should retry @@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + enlarge = 0; while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + + memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); + if (memlimit < val) + enlarge = 1; + ret = res_counter_set_limit(&memcg->res, val); if (!ret) { if (memswlimit == val) @@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } @@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memlimit, oldusage, curusage; + u64 memlimit, memswlimit, oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; + int enlarge = 0; /* see mem_cgroup_resize_res_limit */ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; @@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + if (memswlimit < val) + enlarge = 1; ret = res_counter_set_limit(&memcg->memsw, val); if (!ret) { if (memlimit == val) @@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } @@ -2865,6 +2894,7 @@ move_account: if (ret) break; } + memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) goto try_to_free; @@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, return 0; } +static int mem_cgroup_oom_control_read(struct cgroup *cgrp, + struct cftype *cft, struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); + + if (atomic_read(&mem->oom_lock)) + cb->fill(cb, "under_oom", 1); + else + cb->fill(cb, "under_oom", 0); + return 0; +} + +/* + */ +static int mem_cgroup_oom_control_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *parent; + + /* cannot set to root cgroup and only 0 and 1 are allowed */ + if (!cgrp->parent || !((val == 0) || (val == 1))) + return -EINVAL; + + parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* oom-kill-disable is a flag for subhierarchy. */ + if ((parent->use_hierarchy) || + (mem->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); + return -EINVAL; + } + mem->oom_kill_disable = val; + cgroup_unlock(); + return 0; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", + .read_map = mem_cgroup_oom_control_read, + .write_u64 = mem_cgroup_oom_control_write, .register_event = mem_cgroup_oom_register_event, .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), @@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; + mem->oom_kill_disable = parent->oom_kill_disable; } if (parent && parent->use_hierarchy) { @@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void) if (mc.precharge) { __mem_cgroup_cancel_charge(mc.to, mc.precharge); mc.precharge = 0; + memcg_oom_recover(mc.to); } /* * we didn't uncharge from mc.from at mem_cgroup_move_account(), so @@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void) if (mc.moved_charge) { __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; + memcg_oom_recover(mc.from); } /* we must fixup refcnts and charges */ if (mc.moved_swap) { -- cgit v1.2.3-59-g8ed1b From 90254a65833b67502d14736410b3857a15535c67 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 26 May 2010 14:42:38 -0700 Subject: memcg: clean up move charge This patch cleans up move charge code by: - define functions to handle pte for each types, and make is_target_pte_for_mc() cleaner. - instead of checking the MOVE_CHARGE_TYPE_ANON bit, define a function that checks the bit. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 96 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53eb30ebdb49..e5277e8a42a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -266,6 +266,12 @@ static struct move_charge_struct { .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; +static bool move_anon(void) +{ + return test_bit(MOVE_CHARGE_TYPE_ANON, + &mc.to->move_charge_at_immigrate); +} + /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. @@ -4162,50 +4168,66 @@ enum mc_target_type { MC_TARGET_SWAP, }; -static int is_target_pte_for_mc(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, union mc_target *target) +static struct page *mc_handle_present_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent) { - struct page *page = NULL; - struct page_cgroup *pc; - int ret = 0; - swp_entry_t ent = { .val = 0 }; - int usage_count = 0; - bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, - &mc.to->move_charge_at_immigrate); + struct page *page = vm_normal_page(vma, addr, ptent); - if (!pte_present(ptent)) { - /* TODO: handle swap of shmes/tmpfs */ - if (pte_none(ptent) || pte_file(ptent)) - return 0; - else if (is_swap_pte(ptent)) { - ent = pte_to_swp_entry(ptent); - if (!move_anon || non_swap_entry(ent)) - return 0; - usage_count = mem_cgroup_count_swap_user(ent, &page); - } - } else { - page = vm_normal_page(vma, addr, ptent); - if (!page || !page_mapped(page)) - return 0; + if (!page || !page_mapped(page)) + return NULL; + if (PageAnon(page)) { + /* we don't move shared anon */ + if (!move_anon() || page_mapcount(page) > 2) + return NULL; + } else /* * TODO: We don't move charges of file(including shmem/tmpfs) * pages for now. */ - if (!move_anon || !PageAnon(page)) - return 0; - if (!get_page_unless_zero(page)) - return 0; - usage_count = page_mapcount(page); - } - if (usage_count > 1) { - /* - * TODO: We don't move charges of shared(used by multiple - * processes) pages for now. - */ + return NULL; + if (!get_page_unless_zero(page)) + return NULL; + + return page; +} + +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + int usage_count; + struct page *page = NULL; + swp_entry_t ent = pte_to_swp_entry(ptent); + + if (!move_anon() || non_swap_entry(ent)) + return NULL; + usage_count = mem_cgroup_count_swap_user(ent, &page); + if (usage_count > 1) { /* we don't move shared anon */ if (page) put_page(page); - return 0; + return NULL; } + if (do_swap_account) + entry->val = ent.val; + + return page; +} + +static int is_target_pte_for_mc(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, union mc_target *target) +{ + struct page *page = NULL; + struct page_cgroup *pc; + int ret = 0; + swp_entry_t ent = { .val = 0 }; + + if (pte_present(ptent)) + page = mc_handle_present_pte(vma, addr, ptent); + else if (is_swap_pte(ptent)) + page = mc_handle_swap_pte(vma, addr, ptent, &ent); + /* TODO: handle swap of shmes/tmpfs */ + + if (!page && !ent.val) + return 0; if (page) { pc = lookup_page_cgroup(page); /* @@ -4221,8 +4243,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, if (!ret || !target) put_page(page); } - /* throught */ - if (ent.val && do_swap_account && !ret && + /* There is a swap entry and a page doesn't exist or isn't charged */ + if (ent.val && !ret && css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { ret = MC_TARGET_SWAP; if (target) -- cgit v1.2.3-59-g8ed1b From 87946a72283be3de936adc754b7007df7d3e6aeb Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 26 May 2010 14:42:39 -0700 Subject: memcg: move charge of file pages This patch adds support for moving charge of file pages, which include normal file, tmpfs file and swaps of tmpfs file. It's enabled by setting bit 1 of /memory.move_charge_at_immigrate. Unlike the case of anonymous pages, file pages(and swaps) in the range mmapped by the task will be moved even if the task hasn't done page fault, i.e. they might not be the task's "RSS", but other task's "RSS" that maps the same file. And mapcount of the page is ignored(the page can be moved even if page_mapcount(page) > 1). So, conditions that the page/swap should be met to be moved is that it must be in the range mmapped by the target task and it must be charged to the old cgroup. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix warning] Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 18 +++++++---- include/linux/swap.h | 5 ++++ mm/memcontrol.c | 56 +++++++++++++++++++++++++++-------- mm/shmem.c | 64 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 44e7ded33448..5e028870ee8a 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -454,21 +454,27 @@ And if you want disable it again: 8.2 Type of charges which can be move Each bits of move_charge_at_immigrate has its own meaning about what type of -charges should be moved. +charges should be moved. But in any cases, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current(old) +memory cgroup. bit | what type of charges would be moved ? -----+------------------------------------------------------------------------ 0 | A charge of an anonymous page(or swap of it) used by the target task. | Those pages and swaps must be used only by the target task. You must | enable Swap Extension(see 2.4) to enable move of swap charges. - -Note: Those pages and swaps must be charged to the old cgroup. -Note: More type of pages(e.g. file cache, shmem,) will be supported by other - bits in future. + -----+------------------------------------------------------------------------ + 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) + | and swaps of tmpfs file) mmaped by the target task. Unlike the case of + | anonymous pages, file pages(and swaps) in the range mmapped by the task + | will be moved even if the task hasn't done page fault, i.e. they might + | not be the task's "RSS", but other task's "RSS" that maps the same file. + | And mapcount of the page is ignored(the page can be moved even if + | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to + | enable move of swap charges. 8.3 TODO -- Add support for other types of pages(e.g. file cache, shmem, etc.). - Implement madvise(2) to let users decide the vma to be moved or not to be moved. - All of moving charge operations are done under cgroup_mutex. It's not good diff --git a/include/linux/swap.h b/include/linux/swap.h index b6b614364dd8..ff4acea9bbdb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,6 +282,11 @@ extern void kswapd_stop(int nid); extern int shmem_unuse(swp_entry_t entry, struct page *page); #endif /* CONFIG_MMU */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent); +#endif + extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e5277e8a42a8..be5f478351bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,6 +250,7 @@ struct mem_cgroup { */ enum move_type { MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ + MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ NR_MOVE_TYPE, }; @@ -272,6 +273,12 @@ static bool move_anon(void) &mc.to->move_charge_at_immigrate); } +static bool move_file(void) +{ + return test_bit(MOVE_CHARGE_TYPE_FILE, + &mc.to->move_charge_at_immigrate); +} + /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. @@ -4179,11 +4186,8 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, /* we don't move shared anon */ if (!move_anon() || page_mapcount(page) > 2) return NULL; - } else - /* - * TODO: We don't move charges of file(including shmem/tmpfs) - * pages for now. - */ + } else if (!move_file()) + /* we ignore mapcount for file pages */ return NULL; if (!get_page_unless_zero(page)) return NULL; @@ -4212,6 +4216,39 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return page; } +static struct page *mc_handle_file_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + struct inode *inode; + struct address_space *mapping; + pgoff_t pgoff; + + if (!vma->vm_file) /* anonymous vma */ + return NULL; + if (!move_file()) + return NULL; + + inode = vma->vm_file->f_path.dentry->d_inode; + mapping = vma->vm_file->f_mapping; + if (pte_none(ptent)) + pgoff = linear_page_index(vma, addr); + else /* pte_file(ptent) is true */ + pgoff = pte_to_pgoff(ptent); + + /* page is moved even if it's not RSS of this task(page-faulted). */ + if (!mapping_cap_swap_backed(mapping)) { /* normal file */ + page = find_get_page(mapping, pgoff); + } else { /* shmem/tmpfs file. we should take account of swap too. */ + swp_entry_t ent; + mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + if (do_swap_account) + entry->val = ent.val; + } + + return page; +} + static int is_target_pte_for_mc(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { @@ -4224,7 +4261,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - /* TODO: handle swap of shmes/tmpfs */ + else if (pte_none(ptent) || pte_file(ptent)) + page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) return 0; @@ -4285,9 +4323,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) }; if (is_vm_hugetlb_page(vma)) continue; - /* TODO: We don't move charges of shmem/tmpfs pages for now. */ - if (vma->vm_flags & VM_SHARED) - continue; walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } @@ -4484,9 +4519,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) }; if (is_vm_hugetlb_page(vma)) continue; - /* TODO: We don't move charges of shmem/tmpfs pages for now. */ - if (vma->vm_flags & VM_SHARED) - continue; ret = walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_move_charge_walk); if (ret) diff --git a/mm/shmem.c b/mm/shmem.c index 4ef9797bd430..855eaf5b8d5b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2559,6 +2559,45 @@ out4: return error; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + swp_entry_t entry = { .val = 0 }, *ptr; + struct page *page = NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, pgoff, NULL); +#ifdef CONFIG_SWAP + if (ptr && ptr->val) { + entry.val = ptr->val; + page = find_get_page(&swapper_space, entry.val); + } else +#endif + page = find_get_page(inode->i_mapping, pgoff); + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); +out: + *pagep = page; + *ent = entry; +} +#endif + #else /* !CONFIG_SHMEM */ /* @@ -2598,6 +2637,31 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + struct page *page = NULL; + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + page = find_get_page(inode->i_mapping, pgoff); +out: + *pagep = page; + *ent = (swp_entry_t){ .val = 0 }; +} +#endif + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) -- cgit v1.2.3-59-g8ed1b From df64f81bb1e01cbef967a96642dacf208acb7e72 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 26 May 2010 14:42:41 -0700 Subject: memcg: make oom killer a no-op when no killable task can be found It's pointless to try to kill current if select_bad_process() did not find an eligible task to kill in mem_cgroup_out_of_memory() since it's guaranteed that current is a member of the memcg that is oom and it is, by definition, unkillable. Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Li Zefan Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b68e802a7a7d..709aedfaa014 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -479,12 +479,9 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) read_lock(&tasklist_lock); retry: p = select_bad_process(&points, mem); - if (PTR_ERR(p) == -1UL) + if (!p || PTR_ERR(p) == -1UL) goto out; - if (!p) - p = current; - if (oom_kill_process(p, gfp_mask, 0, points, mem, "Memory cgroup out of memory")) goto retry; -- cgit v1.2.3-59-g8ed1b From 5407a56257b6ade44fd9bcac972c99845b7413cd Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Wed, 26 May 2010 14:42:42 -0700 Subject: mm: remove unnecessary use of atomic The bottom 4 hunks are atomically changing memory to which there are no aliases as it's freshly allocated, so there's no need to use atomic operations. The other hunks are just atomic_read and atomic_set, and do not involve any read-modify-write. The use of atomic_{read,set} doesn't prevent a read/write or write/write race, so if a race were possible (I'm not saying one is), then it would still be there even with atomic_set. See: http://digitalvampire.org/blog/index.php/2007/05/13/atomic-cargo-cults/ Signed-off-by: Phil Carmody Acked-by: Kirill A. Shutemov Cc: Balbir Singh Cc: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index be5f478351bd..93b0239bc34d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -152,7 +152,7 @@ struct mem_cgroup_threshold { /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below usage. */ - atomic_t current_threshold; + int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ @@ -3412,7 +3412,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) * If it's not true, a threshold was crossed after last * call of __mem_cgroup_threshold(). */ - i = atomic_read(&t->current_threshold); + i = t->current_threshold; /* * Iterate backward over array of thresholds starting from @@ -3436,7 +3436,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) eventfd_signal(t->entries[i].eventfd, 1); /* Update current_threshold */ - atomic_set(&t->current_threshold, i - 1); + t->current_threshold = i - 1; unlock: rcu_read_unlock(); } @@ -3528,7 +3528,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, compare_thresholds, NULL); /* Find current threshold */ - atomic_set(&thresholds_new->current_threshold, -1); + thresholds_new->current_threshold = -1; for (i = 0; i < size; i++) { if (thresholds_new->entries[i].threshold < usage) { /* @@ -3536,7 +3536,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, * until rcu_assign_pointer(), so it's safe to increment * it here. */ - atomic_inc(&thresholds_new->current_threshold); + ++thresholds_new->current_threshold; } } @@ -3607,7 +3607,7 @@ static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, thresholds_new->size = size; /* Copy thresholds and find current threshold */ - atomic_set(&thresholds_new->current_threshold, -1); + thresholds_new->current_threshold = -1; for (i = 0, j = 0; i < thresholds->size; i++) { if (thresholds->entries[i].eventfd == eventfd) continue; @@ -3619,7 +3619,7 @@ static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, * until rcu_assign_pointer(), so it's safe to increment * it here. */ - atomic_inc(&thresholds_new->current_threshold); + ++thresholds_new->current_threshold; } j++; } -- cgit v1.2.3-59-g8ed1b From 315c1998e10527ff364a9883048455e609bc7232 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Wed, 26 May 2010 14:42:43 -0700 Subject: mm: memcontrol - uninitialised return value Only an out of memory error will cause ret to be set. Signed-off-by: Phil Carmody Acked-by: Kirill A. Shutemov Cc: Balbir Singh Cc: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93b0239bc34d..8c200e86da4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3563,7 +3563,7 @@ static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, int type = MEMFILE_TYPE(cft->private); u64 usage; int size = 0; - int i, j, ret; + int i, j, ret = 0; mutex_lock(&memcg->thresholds_lock); if (type == _MEM) -- cgit v1.2.3-59-g8ed1b From ac39cf8cb86c45eeac6a592ce0d58f9021a97235 Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Wed, 26 May 2010 14:42:46 -0700 Subject: memcg: fix mis-accounting of file mapped racy with migration FILE_MAPPED per memcg of migrated file cache is not properly updated, because our hook in page_add_file_rmap() can't know to which memcg FILE_MAPPED should be counted. Basically, this patch is for fixing the bug but includes some big changes to fix up other messes. Now, at migrating mapped file, events happen in following sequence. 1. allocate a new page. 2. get memcg of an old page. 3. charge ageinst a new page before migration. But at this point, no changes to new page's page_cgroup, no commit for the charge. (IOW, PCG_USED bit is not set.) 4. page migration replaces radix-tree, old-page and new-page. 5. page migration remaps the new page if the old page was mapped. 6. Here, the new page is unlocked. 7. memcg commits the charge for newpage, Mark the new page's page_cgroup as PCG_USED. Because "commit" happens after page-remap, we can count FILE_MAPPED at "5", because we should avoid to trust page_cgroup->mem_cgroup. if PCG_USED bit is unset. (Note: memcg's LRU removal code does that but LRU-isolation logic is used for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is not on LRU or page_cgroup->mem_cgroup is NULL.) We can lose file_mapped accounting information at 5 because FILE_MAPPED is updated only when mapcount changes 0->1. So we should catch it. BTW, historically, above implemntation comes from migration-failure of anonymous page. Because we charge both of old page and new page with mapcount=0, we can't catch - the page is really freed before remap. - migration fails but it's freed before remap or .....corner cases. New migration sequence with memcg is: 1. allocate a new page. 2. mark PageCgroupMigration to the old page. 3. charge against a new page onto the old page's memcg. (here, new page's pc is marked as PageCgroupUsed.) 4. page migration replaces radix-tree, page table, etc... 5. At remapping, new page's page_cgroup is now makrked as "USED" We can catch 0->1 event and FILE_MAPPED will be properly updated. And we can catch SWAPOUT event after unlock this and freeing this page by unmap() can be caught. 7. Clear PageCgroupMigration of the old page. So, FILE_MAPPED will be correctly updated. Then, for what MIGRATION flag is ? Without it, at migration failure, we may have to charge old page again because it may be fully unmapped. "charge" means that we have to dive into memory reclaim or something complated. So, it's better to avoid charge it again. Before this patch, __commit_charge() was working for both of the old/new page and fixed up all. But this technique has some racy condtion around FILE_MAPPED and SWAPOUT etc... Now, the kernel use MIGRATION flag and don't uncharge old page until the end of migration. I hope this change will make memcg's page migration much simpler. This page migration has caused several troubles. Worth to add a flag for simplification. Reviewed-by: Daisuke Nishimura Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +- include/linux/page_cgroup.h | 5 ++ mm/memcontrol.c | 135 +++++++++++++++++++++++++++++++------------- mm/migrate.c | 2 +- 4 files changed, 107 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 05894795fdc1..9411d32840b0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -90,7 +90,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); extern int -mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); +mem_cgroup_prepare_migration(struct page *page, + struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, struct page *oldpage, struct page *newpage); @@ -227,7 +228,8 @@ static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) } static inline int -mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) +mem_cgroup_prepare_migration(struct page *page, struct page *newpage, + struct mem_cgroup **ptr) { return 0; } diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index aef22ae2af47..5bb13b3db84d 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -40,6 +40,7 @@ enum { PCG_USED, /* this object is in use. */ PCG_ACCT_LRU, /* page has been accounted for */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_MIGRATION, /* under page migration */ }; #define TESTPCGFLAG(uname, lname) \ @@ -79,6 +80,10 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(Migration, MIGRATION) +CLEARPCGFLAG(Migration, MIGRATION) +TESTPCGFLAG(Migration, MIGRATION) + static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc->page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8c200e86da4c..df1234c0dac3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2258,7 +2258,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: case MEM_CGROUP_CHARGE_TYPE_DROP: - if (page_mapped(page)) + /* See mem_cgroup_prepare_migration() */ + if (page_mapped(page) || PageCgroupMigration(pc)) goto unlock_out; break; case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: @@ -2481,10 +2482,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. */ -int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) +int mem_cgroup_prepare_migration(struct page *page, + struct page *newpage, struct mem_cgroup **ptr) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + enum charge_type ctype; int ret = 0; if (mem_cgroup_disabled()) @@ -2495,69 +2498,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); + /* + * At migrating an anonymous page, its mapcount goes down + * to 0 and uncharge() will be called. But, even if it's fully + * unmapped, migration may fail and this page has to be + * charged again. We set MIGRATION flag here and delay uncharge + * until end_migration() is called + * + * Corner Case Thinking + * A) + * When the old page was mapped as Anon and it's unmap-and-freed + * while migration was ongoing. + * If unmap finds the old page, uncharge() of it will be delayed + * until end_migration(). If unmap finds a new page, it's + * uncharged when it make mapcount to be 1->0. If unmap code + * finds swap_migration_entry, the new page will not be mapped + * and end_migration() will find it(mapcount==0). + * + * B) + * When the old page was mapped but migraion fails, the kernel + * remaps it. A charge for it is kept by MIGRATION flag even + * if mapcount goes down to 0. We can do remap successfully + * without charging it again. + * + * C) + * The "old" page is under lock_page() until the end of + * migration, so, the old page itself will not be swapped-out. + * If the new page is swapped out before end_migraton, our + * hook to usual swap-out path will catch the event. + */ + if (PageAnon(page)) + SetPageCgroupMigration(pc); } unlock_page_cgroup(pc); + /* + * If the page is not charged at this point, + * we return here. + */ + if (!mem) + return 0; *ptr = mem; - if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); - css_put(&mem->css); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + css_put(&mem->css);/* drop extra refcnt */ + if (ret || *ptr == NULL) { + if (PageAnon(page)) { + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + /* + * The old page may be fully unmapped while we kept it. + */ + mem_cgroup_uncharge_page(page); + } + return -ENOMEM; } + /* + * We charge new page before it's used/mapped. So, even if unlock_page() + * is called before end_migration, we can catch all events on this new + * page. In the case new page is migrated but not remapped, new page's + * mapcount will be finally 0 and we call uncharge in end_migration(). + */ + pc = lookup_page_cgroup(newpage); + if (PageAnon(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + else if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + __mem_cgroup_commit_charge(mem, pc, ctype); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage) { - struct page *target, *unused; + struct page *used, *unused; struct page_cgroup *pc; - enum charge_type ctype; if (!mem) return; + /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { - target = oldpage; - unused = NULL; + used = oldpage; + unused = newpage; } else { - target = newpage; + used = newpage; unused = oldpage; } - - if (PageAnon(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; - else if (page_is_file_cache(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - else - ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - - /* unused page is not on radix-tree now. */ - if (unused) - __mem_cgroup_uncharge_common(unused, ctype); - - pc = lookup_page_cgroup(target); /* - * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. - * So, double-counting is effectively avoided. + * We disallowed uncharge of pages under migration because mapcount + * of the page goes down to zero, temporarly. + * Clear the flag and check the page should be charged. */ - __mem_cgroup_commit_charge(mem, pc, ctype); + pc = lookup_page_cgroup(oldpage); + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + if (unused != oldpage) + pc = lookup_page_cgroup(unused); + __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); + + pc = lookup_page_cgroup(used); /* - * Both of oldpage and newpage are still under lock_page(). - * Then, we don't have to care about race in radix-tree. - * But we have to be careful that this page is unmapped or not. - * - * There is a case for !page_mapped(). At the start of - * migration, oldpage was mapped. But now, it's zapped. - * But we know *target* page is not freed/reused under us. - * mem_cgroup_uncharge_page() does all necessary checks. + * If a page is a file cache, radix-tree replacement is very atomic + * and we can skip this check. When it was an Anon page, its mapcount + * goes down to 0. But because we added MIGRATION flage, it's not + * uncharged yet. There are several case but page->mapcount check + * and USED bit check in mem_cgroup_uncharge_page() will do enough + * check. (see prepare_charge() also) */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) - mem_cgroup_uncharge_page(target); + if (PageAnon(used)) + mem_cgroup_uncharge_page(used); /* - * At migration, we may charge account against cgroup which has no tasks + * At migration, we may charge account against cgroup which has no + * tasks. * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ diff --git a/mm/migrate.c b/mm/migrate.c index 09e2471afa0f..4205b1d6049e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -590,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; -- cgit v1.2.3-59-g8ed1b From 907860ed381a31b0102f362df67c1c5cae6ef050 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 26 May 2010 14:42:46 -0700 Subject: cgroups: make cftype.unregister_event() void-returning Since we are unable to handle an error returned by cftype.unregister_event() properly, let's make the callback void-returning. mem_cgroup_unregister_event() has been rewritten to be a "never fail" function. On mem_cgroup_usage_register_event() we save old buffer for thresholds array and reuse it in mem_cgroup_usage_unregister_event() to avoid allocation. Signed-off-by: Kirill A. Shutemov Acked-by: KAMEZAWA Hiroyuki Cc: Phil Carmody Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 1 - mm/memcontrol.c | 65 +++++++++++++++++++++++++++++++------------------- 3 files changed, 42 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8f78073d7caa..0c621604baa1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -397,7 +397,7 @@ struct cftype { * This callback must be implemented, if you want provide * notification functionality. */ - int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, + void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd); }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 291775021b2e..422cb19f156e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2994,7 +2994,6 @@ static void cgroup_event_remove(struct work_struct *work) remove); struct cgroup *cgrp = event->cgrp; - /* TODO: check return code */ event->cft->unregister_event(cgrp, event->cft, event->eventfd); eventfd_ctx_put(event->eventfd); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df1234c0dac3..a4172a861b30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -226,9 +226,19 @@ struct mem_cgroup { /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_threshold_ary *thresholds; + /* + * Preallocated buffer to be used in mem_cgroup_unregister_event() + * to make it "never fail". + * It must be able to store at least thresholds->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *__thresholds; + /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_threshold_ary *memsw_thresholds; + /* the same as __thresholds, but for memsw_thresholds */ + struct mem_cgroup_threshold_ary *__memsw_thresholds; + /* For oom notifier event fd */ struct list_head oom_notify; @@ -3604,17 +3614,27 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, else rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); - /* To be sure that nobody uses thresholds before freeing it */ + /* To be sure that nobody uses thresholds */ synchronize_rcu(); - kfree(thresholds); + /* + * Free old preallocated buffer and use thresholds as new + * preallocated buffer. + */ + if (type == _MEM) { + kfree(memcg->__thresholds); + memcg->__thresholds = thresholds; + } else { + kfree(memcg->__memsw_thresholds); + memcg->__memsw_thresholds = thresholds; + } unlock: mutex_unlock(&memcg->thresholds_lock); return ret; } -static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); @@ -3622,7 +3642,7 @@ static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, int type = MEMFILE_TYPE(cft->private); u64 usage; int size = 0; - int i, j, ret = 0; + int i, j; mutex_lock(&memcg->thresholds_lock); if (type == _MEM) @@ -3649,20 +3669,19 @@ static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, size++; } + /* Use preallocated buffer for new array of thresholds */ + if (type == _MEM) + thresholds_new = memcg->__thresholds; + else + thresholds_new = memcg->__memsw_thresholds; + /* Set thresholds array to NULL if we don't have thresholds */ if (!size) { + kfree(thresholds_new); thresholds_new = NULL; - goto assign; + goto swap_buffers; } - /* Allocate memory for new array of thresholds */ - thresholds_new = kmalloc(sizeof(*thresholds_new) + - size * sizeof(struct mem_cgroup_threshold), - GFP_KERNEL); - if (!thresholds_new) { - ret = -ENOMEM; - goto unlock; - } thresholds_new->size = size; /* Copy thresholds and find current threshold */ @@ -3683,20 +3702,20 @@ static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, j++; } -assign: - if (type == _MEM) +swap_buffers: + /* Swap thresholds array and preallocated buffer */ + if (type == _MEM) { + memcg->__thresholds = thresholds; rcu_assign_pointer(memcg->thresholds, thresholds_new); - else + } else { + memcg->__memsw_thresholds = thresholds; rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); + } - /* To be sure that nobody uses thresholds before freeing it */ + /* To be sure that nobody uses thresholds */ synchronize_rcu(); - kfree(thresholds); -unlock: mutex_unlock(&memcg->thresholds_lock); - - return ret; } static int mem_cgroup_oom_register_event(struct cgroup *cgrp, @@ -3724,7 +3743,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, return 0; } -static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd) { struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); @@ -3743,8 +3762,6 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, } mutex_unlock(&memcg_oom_mutex); - - return 0; } static int mem_cgroup_oom_control_read(struct cgroup *cgrp, -- cgit v1.2.3-59-g8ed1b From 2c488db27b614816024e7994117f599337de0f34 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 26 May 2010 14:42:47 -0700 Subject: memcg: clean up memory thresholds Introduce struct mem_cgroup_thresholds. It helps to reduce number of checks of thresholds type (memory or mem+swap). [akpm@linux-foundation.org: repair comment] Signed-off-by: Kirill A. Shutemov Cc: Phil Carmody Cc: Balbir Singh Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Acked-by: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 151 +++++++++++++++++++++++++------------------------------- 1 file changed, 66 insertions(+), 85 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4172a861b30..c6ece0a57595 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -158,6 +158,18 @@ struct mem_cgroup_threshold_ary { /* Array of thresholds */ struct mem_cgroup_threshold entries[0]; }; + +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *spare; +}; + /* for OOM */ struct mem_cgroup_eventfd_list { struct list_head list; @@ -224,20 +236,10 @@ struct mem_cgroup { struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_threshold_ary *thresholds; - - /* - * Preallocated buffer to be used in mem_cgroup_unregister_event() - * to make it "never fail". - * It must be able to store at least thresholds->size - 1 entries. - */ - struct mem_cgroup_threshold_ary *__thresholds; + struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_threshold_ary *memsw_thresholds; - - /* the same as __thresholds, but for memsw_thresholds */ - struct mem_cgroup_threshold_ary *__memsw_thresholds; + struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; @@ -3467,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) rcu_read_lock(); if (!swap) - t = rcu_dereference(memcg->thresholds); + t = rcu_dereference(memcg->thresholds.primary); else - t = rcu_dereference(memcg->memsw_thresholds); + t = rcu_dereference(memcg->memsw_thresholds.primary); if (!t) goto unlock; @@ -3543,91 +3545,78 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; int type = MEMFILE_TYPE(cft->private); u64 threshold, usage; - int size; - int i, ret; + int i, size, ret; ret = res_counter_memparse_write_strategy(args, &threshold); if (ret) return ret; mutex_lock(&memcg->thresholds_lock); + if (type == _MEM) - thresholds = memcg->thresholds; + thresholds = &memcg->thresholds; else if (type == _MEMSWAP) - thresholds = memcg->memsw_thresholds; + thresholds = &memcg->memsw_thresholds; else BUG(); usage = mem_cgroup_usage(memcg, type == _MEMSWAP); /* Check if a threshold crossed before adding a new one */ - if (thresholds) + if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); - if (thresholds) - size = thresholds->size + 1; - else - size = 1; + size = thresholds->primary ? thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - thresholds_new = kmalloc(sizeof(*thresholds_new) + - size * sizeof(struct mem_cgroup_threshold), + new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), GFP_KERNEL); - if (!thresholds_new) { + if (!new) { ret = -ENOMEM; goto unlock; } - thresholds_new->size = size; + new->size = size; /* Copy thresholds (if any) to new array */ - if (thresholds) - memcpy(thresholds_new->entries, thresholds->entries, - thresholds->size * + if (thresholds->primary) { + memcpy(new->entries, thresholds->primary->entries, (size - 1) * sizeof(struct mem_cgroup_threshold)); + } + /* Add new threshold */ - thresholds_new->entries[size - 1].eventfd = eventfd; - thresholds_new->entries[size - 1].threshold = threshold; + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; /* Sort thresholds. Registering of new threshold isn't time-critical */ - sort(thresholds_new->entries, size, - sizeof(struct mem_cgroup_threshold), + sort(new->entries, size, sizeof(struct mem_cgroup_threshold), compare_thresholds, NULL); /* Find current threshold */ - thresholds_new->current_threshold = -1; + new->current_threshold = -1; for (i = 0; i < size; i++) { - if (thresholds_new->entries[i].threshold < usage) { + if (new->entries[i].threshold < usage) { /* - * thresholds_new->current_threshold will not be used - * until rcu_assign_pointer(), so it's safe to increment + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment * it here. */ - ++thresholds_new->current_threshold; + ++new->current_threshold; } } - if (type == _MEM) - rcu_assign_pointer(memcg->thresholds, thresholds_new); - else - rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); - /* - * Free old preallocated buffer and use thresholds as new - * preallocated buffer. - */ - if (type == _MEM) { - kfree(memcg->__thresholds); - memcg->__thresholds = thresholds; - } else { - kfree(memcg->__memsw_thresholds); - memcg->__memsw_thresholds = thresholds; - } unlock: mutex_unlock(&memcg->thresholds_lock); @@ -3638,17 +3627,17 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; int type = MEMFILE_TYPE(cft->private); u64 usage; - int size = 0; - int i, j; + int i, j, size; mutex_lock(&memcg->thresholds_lock); if (type == _MEM) - thresholds = memcg->thresholds; + thresholds = &memcg->thresholds; else if (type == _MEMSWAP) - thresholds = memcg->memsw_thresholds; + thresholds = &memcg->memsw_thresholds; else BUG(); @@ -3664,53 +3653,45 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, __mem_cgroup_threshold(memcg, type == _MEMSWAP); /* Calculate new number of threshold */ - for (i = 0; i < thresholds->size; i++) { - if (thresholds->entries[i].eventfd != eventfd) + size = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) size++; } - /* Use preallocated buffer for new array of thresholds */ - if (type == _MEM) - thresholds_new = memcg->__thresholds; - else - thresholds_new = memcg->__memsw_thresholds; + new = thresholds->spare; /* Set thresholds array to NULL if we don't have thresholds */ if (!size) { - kfree(thresholds_new); - thresholds_new = NULL; + kfree(new); + new = NULL; goto swap_buffers; } - thresholds_new->size = size; + new->size = size; /* Copy thresholds and find current threshold */ - thresholds_new->current_threshold = -1; - for (i = 0, j = 0; i < thresholds->size; i++) { - if (thresholds->entries[i].eventfd == eventfd) + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) continue; - thresholds_new->entries[j] = thresholds->entries[i]; - if (thresholds_new->entries[j].threshold < usage) { + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold < usage) { /* - * thresholds_new->current_threshold will not be used + * new->current_threshold will not be used * until rcu_assign_pointer(), so it's safe to increment * it here. */ - ++thresholds_new->current_threshold; + ++new->current_threshold; } j++; } swap_buffers: - /* Swap thresholds array and preallocated buffer */ - if (type == _MEM) { - memcg->__thresholds = thresholds; - rcu_assign_pointer(memcg->thresholds, thresholds_new); - } else { - memcg->__memsw_thresholds = thresholds; - rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); - } + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); -- cgit v1.2.3-59-g8ed1b From 6adef3ebe570bcde67fd6c16101451ddde5712b5 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 26 May 2010 14:42:49 -0700 Subject: cpusets: new round-robin rotor for SLAB allocations We have observed several workloads running on multi-node systems where memory is assigned unevenly across the nodes in the system. There are numerous reasons for this but one is the round-robin rotor in cpuset_mem_spread_node(). For example, a simple test that writes a multi-page file will allocate pages on nodes 0 2 4 6 ... Odd nodes are skipped. (Sometimes it allocates on odd nodes & skips even nodes). An example is shown below. The program "lfile" writes a file consisting of 10 pages. The program then mmaps the file & uses get_mempolicy(..., MPOL_F_NODE) to determine the nodes where the file pages were allocated. The output is shown below: # ./lfile allocated on nodes: 2 4 6 0 1 2 6 0 2 There is a single rotor that is used for allocating both file pages & slab pages. Writing the file allocates both a data page & a slab page (buffer_head). This advances the RR rotor 2 nodes for each page allocated. A quick confirmation seems to confirm this is the cause of the uneven allocation: # echo 0 >/dev/cpuset/memory_spread_slab # ./lfile allocated on nodes: 6 7 8 9 0 1 2 3 4 5 This patch introduces a second rotor that is used for slab allocations. Signed-off-by: Jack Steiner Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Paul Menage Cc: Jack Steiner Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 6 ++++++ include/linux/sched.h | 1 + kernel/cpuset.c | 20 ++++++++++++++++---- mm/slab.c | 2 +- 4 files changed, 24 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 20b51cab6593..457ed765a116 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -69,6 +69,7 @@ extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int cpuset_mem_spread_node(void); +extern int cpuset_slab_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { @@ -194,6 +195,11 @@ static inline int cpuset_mem_spread_node(void) return 0; } +static inline int cpuset_slab_spread_node(void) +{ + return 0; +} + static inline int cpuset_do_page_mem_spread(void) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index c0151ffd3541..4f31a166b1a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1423,6 +1423,7 @@ struct task_struct { nodemask_t mems_allowed; /* Protected by alloc_lock */ int mems_allowed_change_disable; int cpuset_mem_spread_rotor; + int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 61d6af7fa676..02b9611eadde 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2469,7 +2469,8 @@ void cpuset_unlock(void) } /** - * cpuset_mem_spread_node() - On which node to begin search for a page + * cpuset_mem_spread_node() - On which node to begin search for a file page + * cpuset_slab_spread_node() - On which node to begin search for a slab page * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), @@ -2494,16 +2495,27 @@ void cpuset_unlock(void) * See kmem_cache_alloc_node(). */ -int cpuset_mem_spread_node(void) +static int cpuset_spread_node(int *rotor) { int node; - node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); + node = next_node(*rotor, current->mems_allowed); if (node == MAX_NUMNODES) node = first_node(current->mems_allowed); - current->cpuset_mem_spread_rotor = node; + *rotor = node; return node; } + +int cpuset_mem_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); +} + +int cpuset_slab_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); +} + EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); /** diff --git a/mm/slab.c b/mm/slab.c index 02786e1a32d2..8270ba3d1986 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3219,7 +3219,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) nid_alloc = nid_here = numa_node_id(); get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) - nid_alloc = cpuset_mem_spread_node(); + nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); put_mems_allowed(); -- cgit v1.2.3-59-g8ed1b From eac4068013a067f5fb63005033c13b27fbf6dfca Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 26 May 2010 14:43:32 -0700 Subject: slab: convert cpu notifier to return encapsulate errno value By the previous modification, the cpu notifier can return encapsulate errno value. This converts the cpu notifiers for slab. Signed-off-by: Akinobu Mita Cc: Christoph Lameter Acked-by: Pekka Enberg Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 8270ba3d1986..6437d89a8401 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1321,7 +1321,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, mutex_unlock(&cache_chain_mutex); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpucache_notifier = { -- cgit v1.2.3-59-g8ed1b From 7281201922a0063fa60804ce39c277fc98142a47 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Wed, 26 May 2010 14:44:56 -0700 Subject: numa: add generic percpu var numa_node_id() implementation Rework the generic version of the numa_node_id() function to use the new generic percpu variable infrastructure. Guard the new implementation with a new config option: CONFIG_USE_PERCPU_NUMA_NODE_ID. Archs which support this new implemention will default this option to 'y' when NUMA is configured. This config option could be removed if/when all archs switch over to the generic percpu implementation of numa_node_id(). Arch support involves: 1) converting any existing per cpu variable implementations to use this implementation. x86_64 is an instance of such an arch. 2) archs that don't use a per cpu variable for numa_node_id() will need to initialize the new per cpu variable "numa_node" as cpus are brought on-line. ia64 is an example. 3) Defining USE_PERCPU_NUMA_NODE_ID in arch dependent Kconfig--e.g., when NUMA is configured. This is required because I have retained the old implementation by default to allow archs to be modified incrementally, as desired. Subsequent patches will convert x86_64 and ia64 to use this implemenation. Signed-off-by: Lee Schermerhorn Cc: Tejun Heo Cc: Mel Gorman Reviewed-by: Christoph Lameter Cc: Nick Piggin Cc: David Rientjes Cc: Eric Whitney Cc: KAMEZAWA Hiroyuki Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/topology.h | 4 ++++ include/linux/topology.h | 51 +++++++++++++++++++++++++++++++++++++---- mm/page_alloc.c | 5 ++++ 3 files changed, 55 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c5087d796587..a2e629476b0e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -170,6 +170,10 @@ static inline int numa_node_id(void) { return 0; } +/* + * indicate override: + */ +#define numa_node_id numa_node_id static inline int early_cpu_to_node(int cpu) { diff --git a/include/linux/topology.h b/include/linux/topology.h index 5b81156780b1..2e5518f46571 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #ifndef node_has_online_mem @@ -203,8 +204,53 @@ int arch_update_cpu_topology(void); #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! #endif + #endif /* CONFIG_NUMA */ +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +DECLARE_PER_CPU(int, numa_node); + +#ifndef numa_node_id +/* Returns the number of the current Node. */ +static inline int numa_node_id(void) +{ + return __this_cpu_read(numa_node); +} +#endif + +#ifndef cpu_to_node +static inline int cpu_to_node(int cpu) +{ + return per_cpu(numa_node, cpu); +} +#endif + +#ifndef set_numa_node +static inline void set_numa_node(int node) +{ + percpu_write(numa_node, node); +} +#endif + +#ifndef set_cpu_numa_node +static inline void set_cpu_numa_node(int cpu, int node) +{ + per_cpu(numa_node, cpu) = node; +} +#endif + +#else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ + +/* Returns the number of the current Node. */ +#ifndef numa_node_id +static inline int numa_node_id(void) +{ + return cpu_to_node(raw_smp_processor_id()); +} +#endif + +#endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ + #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif @@ -218,9 +264,4 @@ int arch_update_cpu_topology(void); #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif -/* Returns the number of the current Node. */ -#ifndef numa_node_id -#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#endif - #endif /* _LINUX_TOPOLOGY_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 08b349931ebc..6fe1b65ee1a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,11 @@ #include #include "internal.h" +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +DEFINE_PER_CPU(int, numa_node); +EXPORT_PER_CPU_SYMBOL(numa_node); +#endif + /* * Array of node states. */ -- cgit v1.2.3-59-g8ed1b From 7aac789885512388a66d47280d7e7777ffba1e59 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Wed, 26 May 2010 14:45:00 -0700 Subject: numa: introduce numa_mem_id()- effective local memory node id Introduce numa_mem_id(), based on generic percpu variable infrastructure to track "nearest node with memory" for archs that support memoryless nodes. Define API in when CONFIG_HAVE_MEMORYLESS_NODES defined, else stubs. Architectures will define HAVE_MEMORYLESS_NODES if/when they support them. Archs can override definitions of: numa_mem_id() - returns node number of "local memory" node set_numa_mem() - initialize [this cpus'] per cpu variable 'numa_mem' cpu_to_mem() - return numa_mem for specified cpu; may be used as lvalue Generic initialization of 'numa_mem' occurs in __build_all_zonelists(). This will initialize the boot cpu at boot time, and all cpus on change of numa_zonelist_order, or when node or memory hot-plug requires zonelist rebuild. Archs that support memoryless nodes will need to initialize 'numa_mem' for secondary cpus as they're brought on-line. [akpm@linux-foundation.org: fix build] Signed-off-by: Lee Schermerhorn Signed-off-by: Christoph Lameter Cc: Tejun Heo Cc: Mel Gorman Cc: Christoph Lameter Cc: Nick Piggin Cc: David Rientjes Cc: Eric Whitney Cc: KAMEZAWA Hiroyuki Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/topology.h | 3 +++ include/linux/mmzone.h | 6 +++++ include/linux/topology.h | 61 ++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 510df36dd5d4..fd60700503c8 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -34,6 +34,9 @@ #ifndef cpu_to_node #define cpu_to_node(cpu) ((void)(cpu),0) #endif +#ifndef cpu_to_mem +#define cpu_to_mem(cpu) ((void)(cpu),0) +#endif #ifndef parent_node #define parent_node(node) ((void)(node),0) #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0fa491326c4a..b4d109e389b8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -671,6 +671,12 @@ void memory_present(int nid, unsigned long start, unsigned long end); static inline void memory_present(int nid, unsigned long start, unsigned long end) {} #endif +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +int local_memory_node(int node_id); +#else +static inline int local_memory_node(int node_id) { return node_id; }; +#endif + #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #endif diff --git a/include/linux/topology.h b/include/linux/topology.h index 2e5518f46571..c44df50a05ab 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -251,6 +251,67 @@ static inline int numa_node_id(void) #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + +/* + * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. + * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. + * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). + */ +DECLARE_PER_CPU(int, _numa_mem_); + +#ifndef set_numa_mem +static inline void set_numa_mem(int node) +{ + percpu_write(_numa_mem_, node); +} +#endif + +#ifndef numa_mem_id +/* Returns the number of the nearest Node with memory */ +static inline int numa_mem_id(void) +{ + return __this_cpu_read(_numa_mem_); +} +#endif + +#ifndef cpu_to_mem +static inline int cpu_to_mem(int cpu) +{ + return per_cpu(_numa_mem_, cpu); +} +#endif + +#ifndef set_cpu_numa_mem +static inline void set_cpu_numa_mem(int cpu, int node) +{ + per_cpu(_numa_mem_, cpu) = node; +} +#endif + +#else /* !CONFIG_HAVE_MEMORYLESS_NODES */ + +static inline void set_numa_mem(int node) {} + +static inline void set_cpu_numa_mem(int cpu, int node) {} + +#ifndef numa_mem_id +/* Returns the number of the nearest Node with memory */ +static inline int numa_mem_id(void) +{ + return numa_node_id(); +} +#endif + +#ifndef cpu_to_mem +static inline int cpu_to_mem(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +#endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ + #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6fe1b65ee1a8..431214b941ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -62,6 +62,17 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. + * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. + * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() + * defined in . + */ +DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ +EXPORT_PER_CPU_SYMBOL(_numa_mem_); +#endif + /* * Array of node states. */ @@ -2861,6 +2872,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * Return node id of node used for "local" allocations. + * I.e., first node id of first zone in arg node's generic zonelist. + * Used for initializing percpu 'numa_mem', which is used primarily + * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. + */ +int local_memory_node(int node) +{ + struct zone *zone; + + (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + gfp_zone(GFP_KERNEL), + NULL, + &zone); + return zone->node; +} +#endif #else /* CONFIG_NUMA */ @@ -2975,9 +3004,23 @@ static __init_refok int __build_all_zonelists(void *data) * needs the percpu allocator in order to allocate its pagesets * (a chicken-egg dilemma). */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { setup_pageset(&per_cpu(boot_pageset, cpu), 0); +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * We now know the "local memory node" for each node-- + * i.e., the node of the first zone in the generic zonelist. + * Set up numa_mem percpu variable for on-line cpus. During + * boot, only the boot cpu should be on-line; we'll init the + * secondary cpus' numa_mem as they come on-line. During + * node/memory hotplug, we'll fixup all on-line cpus. + */ + if (cpu_online(cpu)) + set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); +#endif + } + return 0; } -- cgit v1.2.3-59-g8ed1b From 7d6e6d09de82cf6cff7fecdba55198b9f47b381c Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Wed, 26 May 2010 14:45:03 -0700 Subject: numa: slab: use numa_mem_id() for slab local memory node Example usage of generic "numa_mem_id()": The mainline slab code, since ~ 2.6.19, does not handle memoryless nodes well. Specifically, the "fast path"--____cache_alloc()--will never succeed as slab doesn't cache offnode object on the per cpu queues, and for memoryless nodes, all memory will be "off node" relative to numa_node_id(). This adds significant overhead to all kmem cache allocations, incurring a significant regression relative to earlier kernels [from before slab.c was reorganized]. This patch uses the generic topology function "numa_mem_id()" to return the "effective local memory node" for the calling context. This is the first node in the local node's generic fallback zonelist-- the same node that "local" mempolicy-based allocations would use. This lets slab cache these "local" allocations and avoid fallback/refill on every allocation. N.B.: Slab will need to handle node and memory hotplug events that could change the value returned by numa_mem_id() for any given node if recent changes to address memory hotplug don't already address this. E.g., flush all per cpu slab queues before rebuilding the zonelists while the "machine" is held in the stopped state. Performance impact on "hackbench 400 process 200" 2.6.34-rc3-mmotm-100405-1609 no-patch this-patch ia64 no memoryless nodes [avg of 10]: 11.713 11.637 ~0.65 diff ia64 cpus all on memless nodes [10]: 228.259 26.484 ~8.6x speedup The slowdown of the patched kernel from ~12 sec to ~28 seconds when configured with memoryless nodes is the result of all cpus allocating from a single node's mm pagepool. The cache lines of the single node are distributed/interleaved over the memory of the real physical nodes, but the zone lock, list heads, ... of the single node with memory still each live in a single cache line that is accessed from all processors. x86_64 [8x6 AMD] [avg of 40]: 2.883 2.845 Signed-off-by: Lee Schermerhorn Cc: Tejun Heo Cc: Mel Gorman Cc: Christoph Lameter Cc: Nick Piggin Cc: David Rientjes Cc: Eric Whitney Cc: KAMEZAWA Hiroyuki Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6437d89a8401..e49f8f46f46d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -821,7 +821,7 @@ static void init_reap_node(int cpu) { int node; - node = next_node(cpu_to_node(cpu), node_online_map); + node = next_node(cpu_to_mem(cpu), node_online_map); if (node == MAX_NUMNODES) node = first_node(node_online_map); @@ -1050,7 +1050,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = numa_mem_id(); /* * Make sure we are not freeing a object from another node to the array @@ -1129,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { @@ -1194,7 +1194,7 @@ static int __cpuinit cpuup_prepare(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); int err; /* @@ -1479,7 +1479,7 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ - node = numa_node_id(); + node = numa_mem_id(); /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); @@ -2121,7 +2121,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } } - cachep->nodelists[numa_node_id()]->next_reap = + cachep->nodelists[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; @@ -2452,7 +2452,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); + assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); #endif } @@ -2479,7 +2479,7 @@ static void do_drain(void *arg) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_node_id(); + int node = numa_mem_id(); check_irq_off(); ac = cpu_cache_get(cachep); @@ -3012,7 +3012,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) retry: check_irq_off(); - node = numa_node_id(); + node = numa_mem_id(); ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -3216,7 +3216,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; - nid_alloc = nid_here = numa_node_id(); + nid_alloc = nid_here = numa_mem_id(); get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); @@ -3281,7 +3281,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_node_id()); + obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3389,6 +3389,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, { unsigned long save_flags; void *ptr; + int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; @@ -3401,7 +3402,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_save(save_flags); if (nodeid == -1) - nodeid = numa_node_id(); + nodeid = slab_node; if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ @@ -3409,7 +3410,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == slab_node) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback @@ -3453,8 +3454,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_mem_id()); out: return objp; @@ -3551,7 +3552,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); batchcount = ac->batchcount; #if DEBUG @@ -3985,7 +3986,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return -ENOMEM; for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) @@ -4007,9 +4008,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); + spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4115,7 +4116,7 @@ static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) -- cgit v1.2.3-59-g8ed1b From 1b061d9247f71cd15edc4c4c4600191a903642c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 May 2010 17:53:41 +0200 Subject: rename the generic fsync implementations We don't name our generic fsync implementations very well currently. The no-op implementation for in-memory filesystems currently is called simple_sync_file which doesn't make too much sense to start with, the the generic one for simple filesystems is called simple_fsync which can lead to some confusion. This patch renames the generic file fsync method to generic_file_fsync to match the other generic_file_* routines it is supposed to be used with, and the no-op implementation to noop_fsync to make it obvious what to expect. In addition add some documentation for both methods. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- fs/adfs/dir.c | 2 +- fs/adfs/file.c | 2 +- fs/bfs/dir.c | 2 +- fs/ext2/file.c | 2 +- fs/ext4/fsync.c | 2 +- fs/fat/file.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/libfs.c | 30 +++++++++++++++++++++--------- fs/minix/dir.c | 2 +- fs/minix/file.c | 2 +- fs/omfs/file.c | 2 +- fs/qnx4/dir.c | 2 +- fs/ramfs/file-mmu.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/sysv/dir.c | 2 +- fs/sysv/file.c | 2 +- fs/udf/dir.c | 2 +- fs/udf/file.c | 2 +- fs/ufs/dir.c | 2 +- fs/ufs/file.c | 2 +- include/linux/fs.h | 4 ++-- mm/shmem.c | 2 +- 23 files changed, 44 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index fc1b1c42b1dc..e5e5f823d687 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -251,7 +251,7 @@ const struct file_operations spufs_context_fops = { .llseek = dcache_dir_lseek, .read = generic_read_dir, .readdir = dcache_readdir, - .fsync = simple_sync_file, + .fsync = noop_fsync, }; EXPORT_SYMBOL_GPL(spufs_context_fops); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 23aa52f548a0..f4287e4de744 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = adfs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static int diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 005ea34d1758..a36da5382b40 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 8f73841fc974..d967e052b779 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 48bcfc327014..d82e7ca93455 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -46,7 +46,7 @@ int ext2_fsync(struct file *file, int datasync) struct super_block *sb = file->f_mapping->host->i_sb; struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = simple_fsync(file, datasync); + ret = generic_file_fsync(file, datasync); if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 40f345201737..592adf2e546e 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -91,7 +91,7 @@ int ext4_sync_file(struct file *file, int datasync) return ret; if (!journal) { - ret = simple_fsync(file, datasync); + ret = generic_file_fsync(file, datasync); if (!ret && !list_empty(&inode->i_dentry)) ext4_sync_parent(inode); return ret; diff --git a/fs/fat/file.c b/fs/fat/file.c index 29a576944374..8c13b8acfd2f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -154,7 +154,7 @@ int fat_file_fsync(struct file *filp, int datasync) struct inode *inode = filp->f_mapping->host; int res, err; - res = simple_fsync(filp, datasync); + res = generic_file_fsync(filp, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); return res ? res : err; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a0bbd3d1b41a..a4e9a7ec3691 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -688,7 +688,7 @@ static void init_once(void *foo) const struct file_operations hugetlbfs_file_operations = { .read = hugetlbfs_read, .mmap = hugetlbfs_file_mmap, - .fsync = simple_sync_file, + .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, }; diff --git a/fs/libfs.c b/fs/libfs.c index e57ea58bda68..b84d0a7a2204 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -58,11 +58,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na return NULL; } -int simple_sync_file(struct file *file, int datasync) -{ - return 0; -} - int dcache_dir_open(struct inode *inode, struct file *file) { static struct qstr cursor_name = {.len = 1, .name = "."}; @@ -190,7 +185,7 @@ const struct file_operations simple_dir_operations = { .llseek = dcache_dir_lseek, .read = generic_read_dir, .readdir = dcache_readdir, - .fsync = simple_sync_file, + .fsync = noop_fsync, }; const struct inode_operations simple_dir_inode_operations = { @@ -851,7 +846,16 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, } EXPORT_SYMBOL_GPL(generic_fh_to_parent); -int simple_fsync(struct file *file, int datasync) +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * @file: file to synchronize + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int generic_file_fsync(struct file *file, int datasync) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -872,7 +876,15 @@ int simple_fsync(struct file *file, int datasync) ret = err; return ret; } -EXPORT_SYMBOL(simple_fsync); +EXPORT_SYMBOL(generic_file_fsync); + +/* + * No-op implementation of ->fsync for in-memory filesystems. + */ +int noop_fsync(struct file *file, int datasync) +{ + return 0; +} EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); @@ -895,7 +907,7 @@ EXPORT_SYMBOL(simple_release_fs); EXPORT_SYMBOL(simple_rename); EXPORT_SYMBOL(simple_rmdir); EXPORT_SYMBOL(simple_statfs); -EXPORT_SYMBOL(simple_sync_file); +EXPORT_SYMBOL(noop_fsync); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(simple_write_to_buffer); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 6198731d7fcd..aaab97ec5bc0 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = minix_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) diff --git a/fs/minix/file.c b/fs/minix/file.c index 3eec3e607a87..d5320ff23faf 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 399487c09364..6e7a3291bbe8 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 3d3fd4692133..6e8fc62b40a8 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -80,7 +80,7 @@ const struct file_operations qnx4_dir_operations = .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 78f613cb9c76..50020501c5eb 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -43,7 +43,7 @@ const struct file_operations ramfs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5ea4ad81a429..869f2d80183d 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = { .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 1dabed286b4c..79941e4964a4 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = sysv_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 96340c01f4a7..750cc22349bd 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 1660c81ffa3d..51552bf50225 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -211,5 +211,5 @@ const struct file_operations udf_dir_operations = { .read = generic_read_dir, .readdir = udf_readdir, .unlocked_ioctl = udf_ioctl, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index baae3a723946..0660280aa180 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -224,7 +224,7 @@ const struct file_operations udf_file_operations = { .write = do_sync_write, .aio_write = udf_file_aio_write, .release = udf_release_file, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 317a0d444f6b..ec784756dc65 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -666,6 +666,6 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index a8962cecde5b..d4c30d1bef86 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -42,6 +42,6 @@ const struct file_operations ufs_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .open = dquot_file_open, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index eb39e5eb77f5..acf6c52a50dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2348,7 +2348,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int simple_sync_file(struct file *, int); +extern int noop_fsync(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2373,7 +2373,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int simple_fsync(struct file *, int); +extern int generic_file_fsync(struct file *, int); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, diff --git a/mm/shmem.c b/mm/shmem.c index 855eaf5b8d5b..4daf8cecf422 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2433,7 +2433,7 @@ static const struct file_operations shmem_file_operations = { .write = do_sync_write, .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, #endif -- cgit v1.2.3-59-g8ed1b From 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Thu, 27 May 2010 01:05:33 +1000 Subject: fs: introduce new truncate sequence Introduce a new truncate calling sequence into fs/mm subsystems. Rather than setattr > vmtruncate > truncate, have filesystems call their truncate sequence from ->setattr if filesystem specific operations are required. vmtruncate is deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced previously should be used. simple_setattr is introduced for simple in-ram filesystems to implement the new truncate sequence. Eventually all filesystems should be converted to implement a setattr, and the default code in notify_change should go away. simple_setsize is also introduced to perform just the ATTR_SIZE portion of simple_setattr (ie. changing i_size and trimming pagecache). To implement the new truncate sequence: - filesystem specific manipulations (eg freeing blocks) must be done in the setattr method rather than ->truncate. - vmtruncate can not be used by core code to trim blocks past i_size in the event of write failure after allocation, so this must be performed in the fs code. - convert usage of helpers block_write_begin, nobh_write_begin, cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed variants. These avoid calling vmtruncate to trim blocks (see previous). - inode_setattr should not be used. generic_setattr is a new function to be used to copy simple attributes into the generic inode. - make use of the better opportunity to handle errors with the new sequence. Big problem with the previous calling sequence: the filesystem is not called until i_size has already changed. This means it is not allowed to fail the call, and also it does not know what the previous i_size was. Also, generic code calling vmtruncate to truncate allocated blocks in case of error had no good way to return a meaningful error (or, for example, atomically handle block deallocation). Cc: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 7 ++- fs/attr.c | 50 ++++++++++++---- fs/buffer.c | 123 ++++++++++++++++++++++++++++++-------- fs/direct-io.c | 61 ++++++++++++------- fs/libfs.c | 76 +++++++++++++++++++++++ include/linux/buffer_head.h | 9 +++ include/linux/fs.h | 27 ++++++++- mm/truncate.c | 10 ++-- 8 files changed, 300 insertions(+), 63 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index d4f5731dcbbb..94677e7dcb13 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -401,11 +401,16 @@ otherwise noted. started might not be in the page cache at the end of the walk). - truncate: called by the VFS to change the size of a file. The + truncate: Deprecated. This will not be called if ->setsize is defined. + Called by the VFS to change the size of a file. The i_size field of the inode is set to the desired size by the VFS before this method is called. This method is called by the truncate(2) system call and related functionality. + Note: ->truncate and vmtruncate are deprecated. Do not add new + instances/calls of these. Filesystems should be converted to do their + truncate sequence via ->setattr(). + permission: called by the VFS to check for access rights on a POSIX-like filesystem. diff --git a/fs/attr.c b/fs/attr.c index 0815e93bb487..b4fa3b0aa596 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok); * @offset: the new size to assign to the inode * @Returns: 0 on success, -ve errno on failure * + * inode_newsize_ok must be called with i_mutex held. + * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). - * - * inode_newsize_ok must be called with i_mutex held. */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { @@ -104,17 +104,25 @@ out_big: } EXPORT_SYMBOL(inode_newsize_ok); -int inode_setattr(struct inode * inode, struct iattr * attr) +/** + * generic_setattr - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * generic_setattr must be called with i_mutex held. + * + * generic_setattr updates the inode's metadata with that specified + * in attr. Noticably missing is inode size update, which is more complex + * as it requires pagecache updates. See simple_setsize. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. + */ +void generic_setattr(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_SIZE && - attr->ia_size != i_size_read(inode)) { - int error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - if (ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr) mode &= ~S_ISGID; inode->i_mode = mode; } +} +EXPORT_SYMBOL(generic_setattr); + +/* + * note this function is deprecated, the new truncate sequence should be + * used instead -- see eg. simple_setsize, generic_setattr. + */ +int inode_setattr(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); return 0; diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c..d54812b198e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ out_release: page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } diff --git a/fs/direct-io.c b/fs/direct-io.c index da111aacb46e..7600aacf531d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1134,27 +1134,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, return ret; } -/* - * This is a library function for use by filesystem drivers. - * - * The locking rules are governed by the flags parameter: - * - if the flags value contains DIO_LOCKING we use a fancy locking - * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is - * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). - * - * - if the flags value does NOT contain DIO_LOCKING we don't use any - * internal locking but rather rely on the filesystem to synchronize - * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. - */ ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1247,9 +1228,46 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, nr_segs, blkbits, get_block, end_io, submit_io, dio); +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc); + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. + */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + ssize_t retval; + + retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, + offset, nr_segs, get_block, end_io, submit_io, flags); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in + * their own manner. This is a further example of where the old + * truncate sequence is inadequate. * * NOTE: filesystems with their own locking have to handle this * on their own. @@ -1257,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + if (end > isize) vmtruncate(inode, isize); } } -out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/libfs.c b/fs/libfs.c index b84d0a7a2204..09e1016eb774 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } +/** + * simple_setsize - handle core mm and vfs requirements for file size change + * @inode: inode + * @newsize: new file size + * + * Returns 0 on success, -error on failure. + * + * simple_setsize must be called with inode_mutex held. + * + * simple_setsize will check that the requested new size is OK (see + * inode_newsize_ok), and then will perform the necessary i_size update + * and pagecache truncation (if necessary). It will be typically be called + * from the filesystem's setattr function when ATTR_SIZE is passed in. + * + * The inode itself must have correct permissions and attributes to allow + * i_size to be changed, this function then just checks that the new size + * requested is valid. + * + * In the case of simple in-memory filesystems with inodes stored solely + * in the inode cache, and file data in the pagecache, nothing more needs + * to be done to satisfy a truncate request. Filesystems with on-disk + * blocks for example will need to free them in the case of truncate, in + * that case it may be easier not to use simple_setsize (but each of its + * components will likely be required at some point to update pagecache + * and inode etc). + */ +int simple_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + return error; +} +EXPORT_SYMBOL(simple_setsize); + +/** + * simple_setattr - setattr for simple in-memory filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr implements setattr for an in-memory filesystem which + * does not store its own file data or metadata (eg. uses the page cache + * and inode cache as its data store). + */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) { + error = simple_setsize(inode, iattr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, iattr); + + return error; +} +EXPORT_SYMBOL(simple_setattr); + int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 05e5f5996216..1b9ba193b789 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -203,6 +203,9 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, int block_read_full_page(struct page*, get_block_t*); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from); +int block_write_begin_newtrunc(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); int block_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); @@ -214,6 +217,9 @@ int generic_write_end(struct file *, struct address_space *, struct page *, void *); void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); +int cont_write_begin_newtrunc(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **, + get_block_t *, loff_t *); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t *, loff_t *); @@ -225,6 +231,9 @@ void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int file_fsync(struct file *, int); +int nobh_write_begin_newtrunc(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); int nobh_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); diff --git a/include/linux/fs.h b/include/linux/fs.h index acf6c52a50dd..3428393942a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,10 @@ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); void dio_end_io(struct bio *bio, int error); +ssize_t __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int lock_type); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, @@ -2270,6 +2274,24 @@ enum { DIO_SKIP_HOLES = 0x02, }; +static inline ssize_t blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, + DIO_LOCKING | DIO_SKIP_HOLES); +} + +static inline ssize_t blockdev_direct_IO_no_locking_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, 0); +} static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, @@ -2342,12 +2364,14 @@ extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_setattr(struct dentry *, struct iattr *); extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int simple_setsize(struct inode *, loff_t); extern int noop_fsync(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); @@ -2384,7 +2408,8 @@ extern int buffer_migrate_page(struct address_space *, extern int inode_change_ok(const struct inode *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); -extern int __must_check inode_setattr(struct inode *, struct iattr *); +extern int __must_check inode_setattr(struct inode *, const struct iattr *); +extern void generic_setattr(struct inode *inode, const struct iattr *attr); extern void file_update_time(struct file *file); diff --git a/mm/truncate.c b/mm/truncate.c index f42675a3615d..937571b8b233 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -548,18 +548,18 @@ EXPORT_SYMBOL(truncate_pagecache); * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last * incomplete page. Ugly, but necessary. + * + * This function is deprecated and simple_setsize or truncate_pagecache + * should be used instead. */ int vmtruncate(struct inode *inode, loff_t offset) { - loff_t oldsize; int error; - error = inode_newsize_ok(inode, offset); + error = simple_setsize(inode, offset); if (error) return error; - oldsize = inode->i_size; - i_size_write(inode, offset); - truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) inode->i_op->truncate(inode); -- cgit v1.2.3-59-g8ed1b From 3889e6e76f66b7de208a1709d0fe530b21a2d384 Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Thu, 27 May 2010 01:05:36 +1000 Subject: tmpfs: convert to use the new truncate convention Cc: Christoph Hellwig Acked-by: Hugh Dickins Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- mm/shmem.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 4daf8cecf422..7e5030ae18ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -727,10 +727,11 @@ done2: if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { /* * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since vmtruncate or - * generic_delete_inode did it, before we lowered next_index. - * Also, though shmem_getpage checks i_size before adding to - * cache, no recheck after: so fix the narrow window there too. + * may have swizzled a page in from swap since + * truncate_pagecache or generic_delete_inode did it, before we + * lowered next_index. Also, though shmem_getpage checks + * i_size before adding to cache, no recheck after: so fix the + * narrow window there too. * * Recalling truncate_inode_pages_range and unmap_mapping_range * every time for punch_hole (which never got a chance to clear @@ -760,19 +761,16 @@ done2: } } -static void shmem_truncate(struct inode *inode) -{ - shmem_truncate_range(inode, inode->i_size, (loff_t)-1); -} - static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - struct page *page = NULL; int error; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size < inode->i_size) { + loff_t newsize = attr->ia_size; + struct page *page = NULL; + + if (newsize < inode->i_size) { /* * If truncating down to a partial page, then * if that page is already allocated, hold it @@ -780,9 +778,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * truncate_partial_page cannnot miss it were * it assigned to swap. */ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (newsize & (PAGE_CACHE_SIZE-1)) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + newsize >> PAGE_CACHE_SHIFT, &page, SGP_READ, NULL); if (page) unlock_page(page); @@ -794,24 +792,29 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * if it's being fully truncated to zero-length: the * nrpages check is efficient enough in that case. */ - if (attr->ia_size) { + if (newsize) { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); info->flags &= ~SHMEM_PAGEIN; spin_unlock(&info->lock); } } + + error = simple_setsize(inode, newsize); + if (page) + page_cache_release(page); + if (error) + return error; + shmem_truncate_range(inode, newsize, (loff_t)-1); } error = inode_change_ok(inode, attr); if (!error) - error = inode_setattr(inode, attr); + generic_setattr(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL if (!error && (attr->ia_valid & ATTR_MODE)) error = generic_acl_chmod(inode); #endif - if (page) - page_cache_release(page); return error; } @@ -819,11 +822,11 @@ static void shmem_delete_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - if (inode->i_op->truncate == shmem_truncate) { + if (inode->i_mapping->a_ops == &shmem_aops) { truncate_inode_pages(inode->i_mapping, 0); shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; - shmem_truncate(inode); + shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->swaplist)) { mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); @@ -2022,7 +2025,6 @@ static const struct inode_operations shmem_symlink_inline_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { - .truncate = shmem_truncate, .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, @@ -2440,7 +2442,6 @@ static const struct file_operations shmem_file_operations = { }; static const struct inode_operations shmem_inode_operations = { - .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_POSIX_ACL -- cgit v1.2.3-59-g8ed1b