From 73293c2f900d0adbb6a415b312cd57976d5ae242 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:37 +0900 Subject: slab: correct pfmemalloc check We check pfmemalloc per slab, not per page; you can see this in is_slab_pfmemalloc(). So the other pages of a slab don't need pfmemalloc set or cleared, and we should check the pfmemalloc page flag of the first page only, but the current implementation doesn't do that. virt_to_head_page(obj) just returns the 'struct page' of that object, not that of the first page, since SLAB doesn't use __GFP_COMP when CONFIG_MMU is enabled. To get the 'struct page' of the first page, we first look up the slab and then go through virt_to_head_page(slab->s_mem). Acked-by: Andi Kleen Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2580db062df9..0b4ddafd8a03 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, { if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); + struct slab *slabp = virt_to_slab(objp); + struct page *page = virt_to_head_page(slabp->s_mem); if (PageSlabPfmemalloc(page)) set_obj_pfmemalloc(&objp); } @@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) __SetPageSlab(page + i); if (page->pfmemalloc) - SetPageSlabPfmemalloc(page + i); + SetPageSlabPfmemalloc(page); } memcg_bind_pages(cachep, cachep->gfporder); @@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); + + __ClearPageSlabPfmemalloc(page); while (i--) { BUG_ON(!PageSlab(page)); - __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page++; } -- cgit v1.3-7-g2ca7 From 0c3aa83e00a9cd93f08e7aa42fba01924aa5f2fc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:38 +0900 Subject: slab: change return type of kmem_getpages() to struct page It is more understandable for kmem_getpages() to return a struct page. With this, we also drop one translation from virtual address to page and get better code than before. Below is the size change from this patch. * Before text data bss dec hex filename 22123 23434 4 45561 b1f9 mm/slab.o * After text data bss dec hex filename 22074 23434 4 45512 b1c8 mm/slab.o This also helps the following patch remove struct slab's colouroff. Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 60 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0b4ddafd8a03..7d79bd766002 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -205,7 +205,7 @@ typedef unsigned int kmem_bufctl_t; struct slab_rcu { struct rcu_head head; struct kmem_cache *cachep; - void *addr; + struct page *page; }; /* @@ -1737,7 +1737,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable.
*/ -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct page *page; int nr_pages; @@ -1790,16 +1791,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) kmemcheck_mark_unallocated_pages(page, nr_pages); } - return page_address(page); + return page; } /* * Interface to system's page release. */ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; kmemcheck_free_shadow(page, cachep->gfporder); @@ -1821,7 +1821,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + __free_memcg_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -1829,7 +1829,7 @@ static void kmem_rcu_free(struct rcu_head *head) struct slab_rcu *slab_rcu = (struct slab_rcu *)head; struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, slab_rcu->addr); + kmem_freepages(cachep, slab_rcu->page); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slab_rcu); } @@ -2048,7 +2048,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab */ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { - void *addr = slabp->s_mem - slabp->colouroff; + struct page *page = virt_to_head_page(slabp->s_mem); slab_destroy_debugcheck(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { @@ -2056,10 +2056,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) slab_rcu = (struct slab_rcu *)slabp; slab_rcu->cachep = cachep; - slab_rcu->addr = addr; + slab_rcu->page = page; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); + kmem_freepages(cachep, page); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slabp); } @@ -2604,11 +2604,12 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) * kmem_find_general_cachep till the initialization is complete. * Hence we cannot have slabp_cache same as the original cache. */ -static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, - int colour_off, gfp_t local_flags, - int nodeid) +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, + struct page *page, int colour_off, + gfp_t local_flags, int nodeid) { struct slab *slabp; + void *addr = page_address(page); if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ @@ -2625,12 +2626,12 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, if (!slabp) return NULL; } else { - slabp = objp + colour_off; + slabp = addr + colour_off; colour_off += cachep->slab_size; } slabp->inuse = 0; slabp->colouroff = colour_off; - slabp->s_mem = objp + colour_off; + slabp->s_mem = addr + colour_off; slabp->nodeid = nodeid; slabp->free = 0; return slabp; @@ -2741,12 +2742,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, * virtual address for kfree, ksize, and slab debugging. 
*/ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, - void *addr) + struct page *page) { int nr_pages; - struct page *page; - - page = virt_to_page(addr); nr_pages = 1; if (likely(!PageCompound(page))) @@ -2764,7 +2762,7 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * kmem_cache_alloc() when there are no active objs left in a cache. */ static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) + gfp_t flags, int nodeid, struct page *page) { struct slab *slabp; size_t offset; @@ -2807,18 +2805,18 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); - if (!objp) + if (!page) + page = kmem_getpages(cachep, local_flags, nodeid); + if (!page) goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, objp, offset, + slabp = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); if (!slabp) goto opps1; - slab_map_pages(cachep, slabp, objp); + slab_map_pages(cachep, slabp, page); cache_init_objs(cachep, slabp); @@ -2834,7 +2832,7 @@ static int cache_grow(struct kmem_cache *cachep, spin_unlock(&n->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, page); failed: if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -3250,18 +3248,20 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ + struct page *page; + if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_mem_id()); + page = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); - if (obj) { + if (page) { /* * Insert into the appropriate per node queues */ - nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + nid = page_to_nid(page); + if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (!obj) -- cgit v1.3-7-g2ca7 From ac2b54edbc4768f4a069fc360e2b4e579e9907cb Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:39 +0900 Subject: slab: remove colouroff in struct slab Now there is no user of colouroff, so remove it. Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7d79bd766002..34eb1152aba4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -219,7 +219,6 @@ struct slab { union { struct { struct list_head list; - unsigned long colouroff; void *s_mem; /* including colour offset */ unsigned int inuse; /* num of objs active in slab */ kmem_bufctl_t free; @@ -2630,7 +2629,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, colour_off += cachep->slab_size; } slabp->inuse = 0; - slabp->colouroff = colour_off; slabp->s_mem = addr + colour_off; slabp->nodeid = nodeid; slabp->free = 0; return slabp; -- cgit v1.3-7-g2ca7 From 1ea991b00c26e3a275e2d2e110e5221211fc45b6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:40 +0900 Subject: slab: remove nodeid in struct slab We can get the nodeid using address translation, so this field is not useful. Therefore, remove it.
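To see why the stored field is redundant, here is a minimal user-space sketch of the same lookup; PAGE_SHIFT's value, pool_base, page_meta[] and obj_to_nid() are hypothetical stand-ins for the kernel's mem_map, virt_to_page() and page_to_nid(), not kernel code:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

/* Stand-in for struct page: per-page metadata indexed by page frame. */
struct page_meta {
	int nid;	/* NUMA node this page came from */
};

static struct page_meta page_meta[16];
static char *pool_base;	/* base of a fake page pool */

/* Model of virt_to_page()/page_to_nid(): the address alone finds the page. */
static int obj_to_nid(void *obj)
{
	uintptr_t off = (uintptr_t)((char *)obj - pool_base);
	return page_meta[off >> PAGE_SHIFT].nid;
}

int main(void)
{
	pool_base = aligned_alloc(PAGE_SIZE, 16 * PAGE_SIZE);
	if (!pool_base)
		return 1;
	page_meta[3].nid = 1;	/* pretend page 3 belongs to node 1 */
	void *obj = pool_base + 3 * PAGE_SIZE + 128;	/* object inside page 3 */
	/* No stored nodeid needed: derive it from the object's address. */
	printf("object %p is on node %d\n", obj, obj_to_nid(obj));
	free(pool_base);
	return 0;
}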
Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 34eb1152aba4..71ba8f53f9b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -222,7 +222,6 @@ struct slab { void *s_mem; /* including colour offset */ unsigned int inuse; /* num of objs active in slab */ kmem_bufctl_t free; - unsigned short nodeid; }; struct slab_rcu __slab_cover_slab_rcu; }; @@ -1099,8 +1098,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { - struct slab *slabp = virt_to_slab(objp); - int nodeid = slabp->nodeid; + int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; @@ -1111,7 +1109,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == node)) + if (likely(nodeid == node)) return 0; n = cachep->node[node]; @@ -2630,7 +2628,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, } slabp->inuse = 0; slabp->s_mem = addr + colour_off; - slabp->nodeid = nodeid; slabp->free = 0; return slabp; } @@ -2707,7 +2704,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, next = slab_bufctl(slabp)[slabp->free]; #if DEBUG slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif slabp->free = next; @@ -2721,7 +2718,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, #if DEBUG /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { printk(KERN_ERR "slab: double free detected in cache " -- cgit v1.3-7-g2ca7 From 07d417a1c6f1e386a2276b0cae8ae1d14b8a32cc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:41 +0900 Subject: slab: remove cachep in struct slab_rcu We can get cachep from the page in struct slab_rcu, so remove it.
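Same pattern as the previous patch: derive what used to be stored. A hedged sketch of recovering the cache from per-page state (the struct names here are illustrative, not the kernel's):

#include <stdio.h>

struct cache {
	const char *name;
};

/* Stand-in for struct page: it already carries a back-pointer to its cache. */
struct page_meta {
	struct cache *slab_cache;
};

/* Deferred-free callback: the payload no longer duplicates the pointer. */
static void rcu_free(struct page_meta *page)
{
	struct cache *cachep = page->slab_cache;	/* recovered, not stored */
	printf("freeing pages of cache '%s'\n", cachep->name);
}

int main(void)
{
	struct cache c = { "demo" };
	struct page_meta pg = { &c };
	rcu_free(&pg);
	return 0;
}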
Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 71ba8f53f9b7..7e1aabe2b5d8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -204,7 +204,6 @@ typedef unsigned int kmem_bufctl_t; */ struct slab_rcu { struct rcu_head head; - struct kmem_cache *cachep; struct page *page; }; @@ -1824,7 +1823,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) static void kmem_rcu_free(struct rcu_head *head) { struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - struct kmem_cache *cachep = slab_rcu->cachep; + struct kmem_cache *cachep = slab_rcu->page->slab_cache; kmem_freepages(cachep, slab_rcu->page); if (OFF_SLAB(cachep)) @@ -2052,7 +2051,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) struct slab_rcu *slab_rcu; slab_rcu = (struct slab_rcu *)slabp; - slab_rcu->cachep = cachep; slab_rcu->page = page; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { -- cgit v1.3-7-g2ca7 From 68126702b419fd26ef4946e314bb3a1f57d3a53f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:42 +0900 Subject: slab: overloading the RCU head over the LRU for RCU free With build-time size checking, we can overload the RCU head over the LRU of struct page to free pages of a slab in RCU context. This really helps us overload the struct slab over the struct page, which eventually reduces the memory usage and cache footprint of SLAB. Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- include/linux/mm_types.h | 3 +++ include/linux/slab.h | 9 ++++++- mm/slab.c | 68 +++++++++++++++++++++--------------------------- 3 files changed, 41 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf4b7c1ad12..959cb369b197 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -130,6 +130,9 @@ struct page { struct list_head list; /* slobs list of pages */ struct slab *slab_page; /* slab fields */ + struct rcu_head rcu_head; /* Used by SLAB + * when destroying via RCU + */ }; /* Remainder is not double word aligned */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 6c5cc0ea8713..caaad51fee1f 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -51,7 +51,14 @@ * } * rcu_read_unlock(); * - * See also the comment on struct slab_rcu in mm/slab.c. + * This is useful if we need to approach a kernel structure obliquely, + * from its address obtained without the usual locking. We can lock + * the structure to stabilize it and check it's still at the given address, + * only if we can be sure that the memory has not been meanwhile reused + * for some other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address.
*/ #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ diff --git a/mm/slab.c b/mm/slab.c index 7e1aabe2b5d8..84c4ed62c10d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -188,25 +188,6 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) -/* - * struct slab_rcu - * - * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to - * arrange for kmem_freepages to be called via RCU. This is useful if - * we need to approach a kernel structure obliquely, from its address - * obtained without the usual locking. We can lock the structure to - * stabilize it and check it's still at the given address, only if we - * can be sure that the memory has not been meanwhile reused for some - * other kind of object (which our subsystem's lock might corrupt). - * - * rcu_read_lock before reading the address, then rcu_read_unlock after - * taking the spinlock within the structure expected at that address. - */ -struct slab_rcu { - struct rcu_head head; - struct page *page; -}; - /* * struct slab * @@ -215,14 +196,11 @@ struct slab_rcu { * Slabs are chained into three list: fully used, partial, fully free slabs. */ struct slab { - union { - struct { - struct list_head list; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - }; - struct slab_rcu __slab_cover_slab_rcu; + struct { + struct list_head list; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; }; }; @@ -1509,6 +1487,8 @@ void __init kmem_cache_init(void) { int i; + BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < + sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; setup_node_pointer(kmem_cache); @@ -1822,12 +1802,13 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) static void kmem_rcu_free(struct rcu_head *head) { - struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - struct kmem_cache *cachep = slab_rcu->page->slab_cache; + struct kmem_cache *cachep; + struct page *page; - kmem_freepages(cachep, slab_rcu->page); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slab_rcu); + page = container_of(head, struct page, rcu_head); + cachep = page->slab_cache; + + kmem_freepages(cachep, page); } #if DEBUG @@ -2048,16 +2029,27 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) slab_destroy_debugcheck(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct slab_rcu *slab_rcu; + struct rcu_head *head; + + /* + * RCU free overloads the RCU head over the LRU. + * slab_page has been overloaded over the LRU, + * however it is not used from now on so that + * we can use it safely.
+ */ + head = (void *)&page->rcu_head; + call_rcu(head, kmem_rcu_free); - slab_rcu = (struct slab_rcu *)slabp; - slab_rcu->page = page; - call_rcu(&slab_rcu->head, kmem_rcu_free); } else { kmem_freepages(cachep, page); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); } + + /* + * From now on, we don't use slab management + * although actual page can be freed in rcu context + */ + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); } /** -- cgit v1.3-7-g2ca7 From 56f295ef0dfa7e1d0be18deebe0c15fb6b2d9d5b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:43 +0900 Subject: slab: use well-defined macro, virt_to_slab() This is a trivial change: just use the well-defined macro. Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 84c4ed62c10d..f9e676edeb0f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2865,7 +2865,6 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct page *page; unsigned int objnr; struct slab *slabp; @@ -2873,9 +2872,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, objp -= obj_offset(cachep); kfree_debugcheck(objp); - page = virt_to_head_page(objp); - - slabp = page->slab_page; + slabp = virt_to_slab(objp); if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); @@ -3087,7 +3084,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, struct slab *slabp; unsigned objnr; - slabp = virt_to_head_page(objp)->slab_page; + slabp = virt_to_slab(objp); objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; } -- cgit v1.3-7-g2ca7 From a57a49887eb3398d31bfaa8009531f7121d6537d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:44 +0900 Subject: slab: use __GFP_COMP flag for allocating slab pages If we use the 'struct page' of the first page as 'struct slab', there is no advantage in not using __GFP_COMP. So use the __GFP_COMP flag for all cases.
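A toy model of what __GFP_COMP buys: every tail page of a compound allocation can reach the head page, so state needs to be kept (and flags set) only once on the head. The names below are illustrative stand-ins for struct page, PageSlab and virt_to_head_page(), not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct page in an order-2 compound allocation. */
struct page_meta {
	bool slab;		/* PageSlab, maintained on the head only */
	struct page_meta *head;	/* compound head; the head points to itself */
};

/* Model of virt_to_head_page(): any page of the block finds its head. */
static struct page_meta *head_of(struct page_meta *p)
{
	return p->head;
}

int main(void)
{
	struct page_meta pages[4];
	for (int i = 0; i < 4; i++) {
		pages[i].head = &pages[0];
		pages[i].slab = false;
	}
	pages[0].slab = true;	/* one flag update instead of four */
	printf("tail page 3 is a slab page? %d\n", head_of(&pages[3])->slab);
	return 0;
}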
Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 43 +++++++++---------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f9e676edeb0f..75c60821e382 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1718,15 +1718,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, { struct page *page; int nr_pages; - int i; - -#ifndef CONFIG_MMU - /* - * Nommu uses slab's for process anonymous memory allocations, and thus - * requires __GFP_COMP to properly refcount higher order allocations - */ - flags |= __GFP_COMP; -#endif flags |= cachep->allocflags; if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1750,12 +1741,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); - for (i = 0; i < nr_pages; i++) { - __SetPageSlab(page + i); - - if (page->pfmemalloc) - SetPageSlabPfmemalloc(page); - } + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1775,8 +1763,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - unsigned long i = (1 << cachep->gfporder); - const unsigned long nr_freed = i; + const unsigned long nr_freed = (1 << cachep->gfporder); kmemcheck_free_shadow(page, cachep->gfporder); @@ -1787,12 +1774,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); + BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); - while (i--) { - BUG_ON(!PageSlab(page)); - __ClearPageSlab(page); - page++; - } + __ClearPageSlab(page); memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) @@ -2362,7 +2346,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; - cachep->allocflags = 0; + cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->allocflags |= GFP_DMA; cachep->size = size; @@ -2729,17 +2713,8 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, struct page *page) { - int nr_pages; - - nr_pages = 1; - if (likely(!PageCompound(page))) - nr_pages <<= cache->gfporder; - - do { - page->slab_cache = cache; - page->slab_page = slab; - page++; - } while (--nr_pages); + page->slab_cache = cache; + page->slab_page = slab; } /* -- cgit v1.3-7-g2ca7 From b1cb0982bdd6f57fed690f796659733350bb2cae Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:45 +0900 Subject: slab: change the management method of free objects of the slab The current management method for the slab's free objects is weird, because it touches a random position of the kmem_bufctl_t array when we try to get a free object. See the following example. struct slab's free = 6 kmem_bufctl_t array: 1 END 5 7 0 4 3 2 To get free objects, we access this array with the following pattern. 6 -> 3 -> 7 -> 2 -> 5 -> 4 -> 0 -> 1 -> END If we have many objects, this array grows larger and will not fit in the same cache line. That is not good for performance (a runnable model of both access patterns follows).
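The two access patterns are easy to model. A small self-contained C sketch of the chain walk above, next to the stack-style walk that the rest of this description introduces (index values taken from the examples in this message; END is a stand-in for BUFCTL_END):

#include <stdio.h>

#define NUM 8
#define END NUM	/* stand-in for BUFCTL_END */

int main(void)
{
	/* Old scheme: bufctl[i] holds the index of the next free object,
	 * so starting at free = 6 the walk hops around the array. */
	unsigned int chain[NUM] = { 1, END, 5, 7, 0, 4, 3, 2 };
	unsigned int free = 6;
	printf("chain walk: ");
	while (free != END) {
		printf("%u ", free);
		free = chain[free];	/* random position: 6->3->7->2->... */
	}

	/* New scheme: the array is a stack of free indices and 'free' is
	 * the stack top, so the walk is purely sequential. */
	unsigned int stack[NUM] = { 6, 3, 7, 2, 5, 4, 0, 1 };
	printf("\nstack walk: ");
	for (unsigned int top = 0; top < NUM; top++)
		printf("%u ", stack[top]);
	printf("\n");
	return 0;
}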
We can do the same thing in a simpler way, like a stack. The only thing we have to do is maintain the stack top, which points to a free object. I use the free field of struct slab for this purpose. Then, if we need to get an object, we can take it from the stack top and adjust the top pointer. That's all. This method is already used in array_cache management. The following is the access pattern when we use this method. struct slab's free = 0 kmem_bufctl_t array: 6 3 7 2 5 4 0 1 To get free objects, we access this array with the following pattern. 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 This may help the cache line footprint if the slab has many objects, and, in addition, it makes the code much simpler. Acked-by: Andi Kleen Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 94 ++++++++++++++++++++------------------------------------------- 1 file changed, 30 insertions(+), 64 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 75c60821e382..05fe37eb4a57 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -183,9 +183,6 @@ static bool pfmemalloc_active __read_mostly; */ typedef unsigned int kmem_bufctl_t; -#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) -#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* @@ -2653,9 +2650,8 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_bufctl(slabp)[i] = i + 1; + slab_bufctl(slabp)[i] = i; } - slab_bufctl(slabp)[i - 1] = BUFCTL_END; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) @@ -2671,16 +2667,14 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) { - void *objp = index_to_obj(cachep, slabp, slabp->free); - kmem_bufctl_t next; + void *objp; slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; + objp = index_to_obj(cachep, slabp, slab_bufctl(slabp)[slabp->free]); #if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif - slabp->free = next; + slabp->free++; return objp; } @@ -2689,19 +2683,23 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, int nodeid) { unsigned int objnr = obj_to_index(cachep, slabp, objp); - #if DEBUG + kmem_bufctl_t i; + /* Verify that the slab belongs to the intended node */ WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); + /* Verify double free bug */ + for (i = slabp->free; i < cachep->num; i++) { + if (slab_bufctl(slabp)[i] == objnr) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } } #endif - slab_bufctl(slabp)[objnr] = slabp->free; - slabp->free = objnr; + slabp->free--; + slab_bufctl(slabp)[slabp->free] = objnr; slabp->inuse--; } @@ -2862,9 +2860,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); -#ifdef CONFIG_DEBUG_SLAB_LEAK - slab_bufctl(slabp)[objnr] = BUFCTL_FREE; -#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2881,33 +2876,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, return objp; } -static void
check_slabp(struct kmem_cache *cachep, struct slab *slabp) -{ - kmem_bufctl_t i; - int entries = 0; - - /* Check slab's freelist to see if this obj is there. */ - for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { - entries++; - if (entries > cachep->num || i >= cachep->num) - goto bad; - } - if (entries != cachep->num - slabp->inuse) { -bad: - printk(KERN_ERR "slab: Internal list corruption detected in " - "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse, - print_tainted()); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, - sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), - 1); - BUG(); - } -} #else #define kfree_debugcheck(x) do { } while(0) #define cache_free_debugcheck(x,objp,z) (objp) -#define check_slabp(x,y) do { } while(0) #endif static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, @@ -2957,7 +2928,6 @@ retry: } slabp = list_entry(entry, struct slab, list); - check_slabp(cachep, slabp); check_spinlock_acquired(cachep); /* @@ -2975,11 +2945,10 @@ retry: ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, node)); } - check_slabp(cachep, slabp); /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == BUFCTL_END) + if (slabp->free == cachep->num) list_add(&slabp->list, &n->slabs_full); else list_add(&slabp->list, &n->slabs_partial); @@ -3054,16 +3023,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } -#ifdef CONFIG_DEBUG_SLAB_LEAK - { - struct slab *slabp; - unsigned objnr; - - slabp = virt_to_slab(objp); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; - slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; - } -#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -3269,7 +3228,6 @@ retry: slabp = list_entry(entry, struct slab, list); check_spinlock_acquired_node(cachep, nodeid); - check_slabp(cachep, slabp); STATS_INC_NODEALLOCS(cachep); STATS_INC_ACTIVE(cachep); @@ -3278,12 +3236,11 @@ retry: BUG_ON(slabp->inuse == cachep->num); obj = slab_get_obj(cachep, slabp, nodeid); - check_slabp(cachep, slabp); n->free_objects--; /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == BUFCTL_END) + if (slabp->free == cachep->num) list_add(&slabp->list, &n->slabs_full); else list_add(&slabp->list, &n->slabs_partial); @@ -3445,11 +3402,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, n = cachep->node[node]; list_del(&slabp->list); check_spinlock_acquired_node(cachep, node); - check_slabp(cachep, slabp); slab_put_obj(cachep, slabp, objp, node); STATS_DEC_ACTIVE(cachep); n->free_objects++; - check_slabp(cachep, slabp); /* fixup slab chains */ if (slabp->inuse == 0) { @@ -4308,12 +4263,23 @@ static inline int add_caller(unsigned long *n, unsigned long v) static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) { void *p; - int i; + int i, j; + if (n[0] == n[1]) return; for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { - if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + bool active = true; + + for (j = s->free; j < c->num; j++) { + /* Skip freed item */ + if (slab_bufctl(s)[j] == i) { + active = false; + break; + } + } + if (!active) continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) return; } -- cgit v1.3-7-g2ca7 From 16025177e1e16529451108faed257db95c7c9d6a Mon Sep 17 00:00:00 2001 From: Joonsoo 
Kim Date: Thu, 24 Oct 2013 10:07:46 +0900 Subject: slab: remove kmem_bufctl_t Now that we have changed the management method for the slab's free objects, there is no need for the special values BUFCTL_END, BUFCTL_FREE and BUFCTL_ACTIVE. So remove them. Acked-by: Andi Kleen Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 05fe37eb4a57..6ced1ccf8abb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -163,27 +163,7 @@ */ static bool pfmemalloc_active __read_mostly; -/* - * kmem_bufctl_t: - * - * Bufctl's are used for linking objs within a slab - * linked offsets. - * - * This implementation relies on "struct page" for locating the cache & - * slab an object belongs to. - * This allows the bufctl structure to be small (one int), but limits - * the number of objects a slab (not a cache) can contain when off-slab - * bufctls are used. The limit is the size of the largest general cache - * that does not use off-slab slabs. - * For 32bit archs with 4 kB pages, is this 56. - * This is not serious, as it is only for large objects, when it is unwise - * to have too many per slab. - * Note: This limit can be raised by introducing a general cache whose size - * is less than 512 (PAGE_SIZE<<3), but greater than 256. - */ - -typedef unsigned int kmem_bufctl_t; -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) +#define SLAB_LIMIT (((unsigned int)(~0U))-1) /* * struct slab * @@ -197,7 +177,7 @@ struct slab { struct list_head list; void *s_mem; /* including colour offset */ unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; + unsigned int free; }; }; @@ -613,7 +593,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(sizeof(struct slab)+nr_objs*sizeof(unsigned int), align); } /* @@ -633,7 +613,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * slab is used for: * * - The struct slab - * - One kmem_bufctl_t for each object + * - One unsigned int for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * @@ -658,7 +638,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * into account. */ nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + (buffer_size + sizeof(unsigned int)); /* * This calculated number will be either the right @@ -2068,7 +2048,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * looping condition in cache_grow(). */ offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + offslab_limit /= sizeof(unsigned int); if (num > offslab_limit) break; @@ -2309,7 +2289,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!cachep->num) return -E2BIG; - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cachep->align); + slab_size = ALIGN(cachep->num * sizeof(unsigned int) + sizeof(struct slab), cachep->align); /* @@ -2324,7 +2304,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab.
No need for manual alignment */ slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + cachep->num * sizeof(unsigned int) + sizeof(struct slab); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2603,9 +2583,9 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, return slabp; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +static inline unsigned int *slab_bufctl(struct slab *slabp) { - return (kmem_bufctl_t *) (slabp + 1); + return (unsigned int *) (slabp + 1); } static void cache_init_objs(struct kmem_cache *cachep, @@ -2684,7 +2664,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, { unsigned int objnr = obj_to_index(cachep, slabp, objp); #if DEBUG - kmem_bufctl_t i; + unsigned int i; /* Verify that the slab belongs to the intended node */ WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); -- cgit v1.3-7-g2ca7 From 45eed508de6cec3174040f800aaf90d60c7b5b5b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:47 +0900 Subject: slab: remove SLAB_LIMIT It's useless now, so remove it. Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6ced1ccf8abb..c271d5b71093 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -163,8 +163,6 @@ */ static bool pfmemalloc_active __read_mostly; -#define SLAB_LIMIT (((unsigned int)(~0U))-1) - /* * struct slab * @@ -626,8 +624,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, mgmt_size = 0; nr_objs = slab_size / buffer_size; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; } else { /* * Ignore padding for the initial guess. The padding @@ -648,9 +644,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, > slab_size) nr_objs--; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); } *num = nr_objs; -- cgit v1.3-7-g2ca7 From 106a74e13b329cf609c145dc198087c04f5f8ca5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:48 +0900 Subject: slab: replace free and inuse in struct slab with newly introduced active Now, free in struct slab has the same meaning as inuse, so remove both and replace them with a single field, active.
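With the stack discipline from the earlier patch, the number of objects in use always equals the stack-top index, which is why one counter can replace both fields. A minimal illustration (toy types, not the kernel's):

#include <stdio.h>

#define NUM 8

struct toy_slab {
	unsigned int bufctl[NUM];	/* stack of free object indices */
	unsigned int active;		/* objects in use == stack top */
};

static unsigned int toy_get(struct toy_slab *s)
{
	return s->bufctl[s->active++];	/* pop the next free index */
}

static void toy_put(struct toy_slab *s, unsigned int objnr)
{
	s->bufctl[--s->active] = objnr;	/* push the index back */
}

int main(void)
{
	struct toy_slab s = { { 0, 1, 2, 3, 4, 5, 6, 7 }, 0 };
	unsigned int a = toy_get(&s), b = toy_get(&s);
	printf("got objects %u and %u, active = %u\n", a, b, s.active);
	toy_put(&s, a);
	printf("after one free, active = %u\n", s.active);
	return 0;
}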
Acked-by: Andi Kleen Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 54 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c271d5b71093..2ec2336a1ffc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -174,8 +174,7 @@ struct slab { struct { struct list_head list; void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - unsigned int free; + unsigned int active; /* num of objs active in slab */ }; }; @@ -1658,7 +1657,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) active_slabs++; } list_for_each_entry(slabp, &n->slabs_partial, list) { - active_objs += slabp->inuse; + active_objs += slabp->active; active_slabs++; } list_for_each_entry(slabp, &n->slabs_free, list) @@ -2451,7 +2450,7 @@ static int drain_freelist(struct kmem_cache *cache, slabp = list_entry(p, struct slab, list); #if DEBUG - BUG_ON(slabp->inuse); + BUG_ON(slabp->active); #endif list_del(&slabp->list); /* @@ -2570,9 +2569,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, slabp = addr + colour_off; colour_off += cachep->slab_size; } - slabp->inuse = 0; + slabp->active = 0; slabp->s_mem = addr + colour_off; - slabp->free = 0; return slabp; } @@ -2642,12 +2640,11 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, { void *objp; - slabp->inuse++; - objp = index_to_obj(cachep, slabp, slab_bufctl(slabp)[slabp->free]); + objp = index_to_obj(cachep, slabp, slab_bufctl(slabp)[slabp->active]); + slabp->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif - slabp->free++; return objp; } @@ -2663,7 +2660,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); /* Verify double free bug */ - for (i = slabp->free; i < cachep->num; i++) { + for (i = slabp->active; i < cachep->num; i++) { if (slab_bufctl(slabp)[i] == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); @@ -2671,9 +2668,8 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, } } #endif - slabp->free--; - slab_bufctl(slabp)[slabp->free] = objnr; - slabp->inuse--; + slabp->active--; + slab_bufctl(slabp)[slabp->active] = objnr; } /* @@ -2908,9 +2904,9 @@ retry: * there must be at least one object available for * allocation. 
*/ - BUG_ON(slabp->inuse >= cachep->num); + BUG_ON(slabp->active >= cachep->num); - while (slabp->inuse < cachep->num && batchcount--) { + while (slabp->active < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); @@ -2921,7 +2917,7 @@ retry: /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == cachep->num) + if (slabp->active == cachep->num) list_add(&slabp->list, &n->slabs_full); else list_add(&slabp->list, &n->slabs_partial); @@ -3206,14 +3202,14 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(slabp->inuse == cachep->num); + BUG_ON(slabp->active == cachep->num); obj = slab_get_obj(cachep, slabp, nodeid); n->free_objects--; /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == cachep->num) + if (slabp->active == cachep->num) list_add(&slabp->list, &n->slabs_full); else list_add(&slabp->list, &n->slabs_partial); @@ -3380,7 +3376,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, n->free_objects++; /* fixup slab chains */ - if (slabp->inuse == 0) { + if (slabp->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; /* No need to drop any previously held @@ -3441,7 +3437,7 @@ free_done: struct slab *slabp; slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); + BUG_ON(slabp->active); i++; p = p->next; @@ -4066,22 +4062,22 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) spin_lock_irq(&n->list_lock); list_for_each_entry(slabp, &n->slabs_full, list) { - if (slabp->inuse != cachep->num && !error) + if (slabp->active != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } list_for_each_entry(slabp, &n->slabs_partial, list) { - if (slabp->inuse == cachep->num && !error) - error = "slabs_partial inuse accounting error"; - if (!slabp->inuse && !error) - error = "slabs_partial/inuse accounting error"; - active_objs += slabp->inuse; + if (slabp->active == cachep->num && !error) + error = "slabs_partial accounting error"; + if (!slabp->active && !error) + error = "slabs_partial accounting error"; + active_objs += slabp->active; active_slabs++; } list_for_each_entry(slabp, &n->slabs_free, list) { - if (slabp->inuse && !error) - error = "slabs_free/inuse accounting error"; + if (slabp->active && !error) + error = "slabs_free accounting error"; num_slabs++; } free_objects += n->free_objects; @@ -4243,7 +4239,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { bool active = true; - for (j = s->free; j < c->num; j++) { + for (j = s->active; j < c->num; j++) { /* Skip freed item */ if (slab_bufctl(s)[j] == i) { active = false; -- cgit v1.3-7-g2ca7 From 8456a648cf44f14365f1f44de90a3da2526a4776 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:49 +0900 Subject: slab: use struct page for slab management Now, there are only a few fields in struct slab, so we can overload these onto struct page. This will save some memory and reduce the cache footprint. After this change, slabp_cache and slab_size are no longer related to a struct slab, so rename them freelist_cache and freelist_size. These changes are just mechanical ones and there is no functional change.
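A rough picture of the overloading, as a user-space union sketch; the layout below only mimics the idea (slab fields sharing words with page-cache state) and is not the kernel's actual struct page:

#include <stdio.h>

/* Toy struct page: slab bookkeeping reuses words that have no meaning
 * while the page is owned by the slab allocator. */
struct toy_page {
	unsigned long flags;
	union {
		void *mapping;		/* page-cache use of this word */
		void *s_mem;		/* slab: address of the first object */
	};
	union {
		unsigned long index;	/* page-cache offset */
		void *freelist;		/* slab: the free-index array */
	};
	unsigned int active;		/* slab: objects in use */
};

int main(void)
{
	static char objects[4096];
	static unsigned int bufctl[8];
	struct toy_page pg = { 0 };

	/* All slab metadata fits in the page itself: no struct slab needed. */
	pg.s_mem = objects;
	pg.freelist = bufctl;
	pg.active = 0;
	printf("s_mem=%p freelist=%p active=%u\n", pg.s_mem, pg.freelist,
	       pg.active);
	return 0;
}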
Acked-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- include/linux/mm_types.h | 21 ++-- include/linux/slab_def.h | 4 +- mm/slab.c | 306 ++++++++++++++++++++++------------------------- 3 files changed, 158 insertions(+), 173 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 959cb369b197..95bf0c5a7eb9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -42,18 +42,22 @@ struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. - */ + union { + struct address_space *mapping; /* If low bit clear, points to + * inode address_space, or NULL. + * If page mapped as anonymous + * memory, low bit is set, and + * it points to anon_vma object: + * see PAGE_MAPPING_ANON below. + */ + void *s_mem; /* slab first object */ + }; + /* Second double word */ struct { union { pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* slub/slob first free object */ + void *freelist; /* sl[aou]b first free object */ bool pfmemalloc; /* If set by the page allocator, * ALLOC_NO_WATERMARKS was set * and the low watermark was not @@ -109,6 +113,7 @@ struct page { }; atomic_t _count; /* Usage count, see below. */ }; + unsigned int active; /* SLAB */ }; }; diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index cd401580bdd3..ca82e8ff89fa 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -41,8 +41,8 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; + struct kmem_cache *freelist_cache; + unsigned int freelist_size; /* constructor func */ void (*ctor)(void *obj); diff --git a/mm/slab.c b/mm/slab.c index 2ec2336a1ffc..0e7f2e73e08e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -163,21 +163,6 @@ */ static bool pfmemalloc_active __read_mostly; -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. 
- */ -struct slab { - struct { - struct list_head list; - void *s_mem; /* including colour offset */ - unsigned int active; /* num of objs active in slab */ - }; -}; - /* * struct array_cache * @@ -405,18 +390,10 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) return page->slab_cache; } -static inline struct slab *virt_to_slab(const void *obj) -{ - struct page *page = virt_to_head_page(obj); - - VM_BUG_ON(!PageSlab(page)); - return page->slab_page; -} - -static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, +static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx) { - return slab->s_mem + cache->size * idx; + return page->s_mem + cache->size * idx; } /* @@ -426,9 +403,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct page *page, void *obj) { - u32 offset = (obj - slab->s_mem); + u32 offset = (obj - page->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); } @@ -590,7 +567,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(unsigned int), align); + return ALIGN(nr_objs * sizeof(unsigned int), align); } /* @@ -609,7 +586,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - The struct slab * - One unsigned int for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object @@ -632,8 +608,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * into the memory allocation when taking the padding * into account. 
*/ - nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(unsigned int)); + nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); /* * This calculated number will be either the right @@ -773,11 +748,11 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } -static inline bool is_slab_pfmemalloc(struct slab *slabp) +static inline bool is_slab_pfmemalloc(struct page *page) { - struct page *page = virt_to_page(slabp->s_mem); + struct page *mem_page = virt_to_page(page->s_mem); - return PageSlabPfmemalloc(page); + return PageSlabPfmemalloc(mem_page); } /* Clears pfmemalloc_active if no slabs have pfmalloc set */ @@ -785,23 +760,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { struct kmem_cache_node *n = cachep->node[numa_mem_id()]; - struct slab *slabp; + struct page *page; unsigned long flags; if (!pfmemalloc_active) return; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(slabp, &n->slabs_full, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_full, lru) + if (is_slab_pfmemalloc(page)) goto out; - list_for_each_entry(slabp, &n->slabs_partial, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_partial, lru) + if (is_slab_pfmemalloc(page)) goto out; - list_for_each_entry(slabp, &n->slabs_free, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_free, lru) + if (is_slab_pfmemalloc(page)) goto out; pfmemalloc_active = false; @@ -841,8 +816,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, */ n = cachep->node[numa_mem_id()]; if (!list_empty(&n->slabs_free) && force_refill) { - struct slab *slabp = virt_to_slab(objp); - ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); + struct page *page = virt_to_head_page(objp); + ClearPageSlabPfmemalloc(virt_to_head_page(page->s_mem)); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -874,9 +849,9 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, { if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ - struct slab *slabp = virt_to_slab(objp); - struct page *page = virt_to_head_page(slabp->s_mem); - if (PageSlabPfmemalloc(page)) + struct page *page = virt_to_head_page(objp); + struct page *mem_page = virt_to_head_page(page->s_mem); + if (PageSlabPfmemalloc(mem_page)) set_obj_pfmemalloc(&objp); } @@ -1633,7 +1608,7 @@ static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { struct kmem_cache_node *n; - struct slab *slabp; + struct page *page; unsigned long flags; int node; @@ -1652,15 +1627,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(slabp, &n->slabs_full, list) { + list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_partial, list) { - active_objs += slabp->active; + list_for_each_entry(page, &n->slabs_partial, lru) { + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_free, list) + list_for_each_entry(page, &n->slabs_free, lru) num_slabs++; free_objects += n->free_objects; @@ -1746,6 +1721,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + 
page_mapcount_reset(page); + page->mapping = NULL; memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) @@ -1910,19 +1887,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = virt_to_slab(objp); + struct page *page = virt_to_head_page(objp); unsigned int objnr; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); if (objnr) { - objp = index_to_obj(cachep, slabp, objnr - 1); + objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, slabp, objnr + 1); + objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -1933,11 +1910,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1962,7 +1940,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab } } #else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { } #endif @@ -1976,11 +1955,12 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct page *page) { - struct page *page = virt_to_head_page(slabp->s_mem); + struct freelist *freelist; - slab_destroy_debugcheck(cachep, slabp); + freelist = page->freelist; + slab_destroy_debugcheck(cachep, page); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct rcu_head *head; @@ -1998,11 +1978,11 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } /* - * From now on, we don't use slab management + * From now on, we don't use freelist * although actual page can be freed in rcu context */ if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + kmem_cache_free(cachep->freelist_cache, freelist); } /** @@ -2039,7 +2019,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). 
*/ - offslab_limit = size - sizeof(struct slab); + offslab_limit = size; offslab_limit /= sizeof(unsigned int); if (num > offslab_limit) @@ -2162,7 +2142,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, slab_size, ralign; + size_t left_over, freelist_size, ralign; gfp_t gfp; int err; size_t size = cachep->size; @@ -2281,22 +2261,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!cachep->num) return -E2BIG; - slab_size = ALIGN(cachep->num * sizeof(unsigned int) - + sizeof(struct slab), cachep->align); + freelist_size = + ALIGN(cachep->num * sizeof(unsigned int), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then * move it on-slab. This is at the expense of any extra colouring. */ - if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { + if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { flags &= ~CFLGS_OFF_SLAB; - left_over -= slab_size; + left_over -= freelist_size; } if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = - cachep->num * sizeof(unsigned int) + sizeof(struct slab); + freelist_size = cachep->num * sizeof(unsigned int); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2313,7 +2292,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (cachep->colour_off < cachep->align) cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; - cachep->slab_size = slab_size; + cachep->freelist_size = freelist_size; cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2322,7 +2301,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { - cachep->slabp_cache = kmalloc_slab(slab_size, 0u); + cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. * But since we go off slab only for object size greater than @@ -2330,7 +2309,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); + BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); } err = setup_cpu_cache(cachep, gfp); @@ -2436,7 +2415,7 @@ static int drain_freelist(struct kmem_cache *cache, { struct list_head *p; int nr_freed; - struct slab *slabp; + struct page *page; nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { @@ -2448,18 +2427,18 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - slabp = list_entry(p, struct slab, list); + page = list_entry(p, struct page, lru); #if DEBUG - BUG_ON(slabp->active); + BUG_ON(page->active); #endif - list_del(&slabp->list); + list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked * to the cache. */ n->free_objects -= cache->num; spin_unlock_irq(&n->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, page); nr_freed++; } out: @@ -2542,18 +2521,18 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) * descriptors in kmem_cache_create, we search through the malloc_sizes array. * If we are creating a malloc_sizes cache here it would not be visible to * kmem_find_general_cachep till the initialization is complete. 
- * Hence we cannot have slabp_cache same as the original cache. + * Hence we cannot have freelist_cache same as the original cache. */ -static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, +static struct freelist *alloc_slabmgmt(struct kmem_cache *cachep, struct page *page, int colour_off, gfp_t local_flags, int nodeid) { - struct slab *slabp; + struct freelist *freelist; void *addr = page_address(page); if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - slabp = kmem_cache_alloc_node(cachep->slabp_cache, + freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); /* * If the first object in the slab is leaked (it's allocated @@ -2561,31 +2540,31 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, * kmemleak does not treat the ->s_mem pointer as a reference * to the object. Otherwise we will not report the leak. */ - kmemleak_scan_area(&slabp->list, sizeof(struct list_head), + kmemleak_scan_area(&page->lru, sizeof(struct list_head), local_flags); - if (!slabp) + if (!freelist) return NULL; } else { - slabp = addr + colour_off; - colour_off += cachep->slab_size; + freelist = addr + colour_off; + colour_off += cachep->freelist_size; } - slabp->active = 0; - slabp->s_mem = addr + colour_off; - return slabp; + page->active = 0; + page->s_mem = addr + colour_off; + return freelist; } -static inline unsigned int *slab_bufctl(struct slab *slabp) +static inline unsigned int *slab_bufctl(struct page *page) { - return (unsigned int *) (slabp + 1); + return (unsigned int *)(page->freelist); } static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp) + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); #if DEBUG /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) @@ -2621,7 +2600,7 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_bufctl(slabp)[i] = i; + slab_bufctl(page)[i] = i; } } @@ -2635,13 +2614,13 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, int nodeid) { void *objp; - objp = index_to_obj(cachep, slabp, slab_bufctl(slabp)[slabp->active]); - slabp->active++; + objp = index_to_obj(cachep, page, slab_bufctl(page)[page->active]); + page->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif @@ -2649,10 +2628,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, +static void slab_put_obj(struct kmem_cache *cachep, struct page *page, void *objp, int nodeid) { - unsigned int objnr = obj_to_index(cachep, slabp, objp); + unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; @@ -2660,16 +2639,16 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); /* Verify double free bug */ - for (i = slabp->active; i < cachep->num; i++) { - if (slab_bufctl(slabp)[i] == objnr) { + for (i = page->active; i < cachep->num; i++) { + if (slab_bufctl(page)[i] == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); } } #endif - slabp->active--; - slab_bufctl(slabp)[slabp->active] = objnr; + page->active--; + slab_bufctl(page)[page->active] = objnr; } /* @@ -2677,11 +2656,11 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, * for the slab allocator to be able to lookup the cache and slab of a * virtual address for kfree, ksize, and slab debugging. */ -static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, - struct page *page) +static void slab_map_pages(struct kmem_cache *cache, struct page *page, + struct freelist *freelist) { page->slab_cache = cache; - page->slab_page = slab; + page->freelist = freelist; } /* @@ -2691,7 +2670,7 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, struct page *page) { - struct slab *slabp; + struct freelist *freelist; size_t offset; gfp_t local_flags; struct kmem_cache_node *n; @@ -2738,14 +2717,14 @@ static int cache_grow(struct kmem_cache *cachep, goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, page, offset, + freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!slabp) + if (!freelist) goto opps1; - slab_map_pages(cachep, slabp, page); + slab_map_pages(cachep, page, freelist); - cache_init_objs(cachep, slabp); + cache_init_objs(cachep, page); if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2753,7 +2732,7 @@ static int cache_grow(struct kmem_cache *cachep, spin_lock(&n->list_lock); /* Make slab active. 
*/ - list_add_tail(&slabp->list, &(n->slabs_free)); + list_add_tail(&page->lru, &(n->slabs_free)); STATS_INC_GROWN(cachep); n->free_objects += cachep->num; spin_unlock(&n->list_lock); @@ -2808,13 +2787,13 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned long caller) { unsigned int objnr; - struct slab *slabp; + struct page *page; BUG_ON(virt_to_cache(objp) != cachep); objp -= obj_offset(cachep); kfree_debugcheck(objp); - slabp = virt_to_slab(objp); + page = virt_to_head_page(objp); if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); @@ -2824,10 +2803,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = (void *)caller; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); + BUG_ON(objp != index_to_obj(cachep, page, objnr)); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC @@ -2886,7 +2865,7 @@ retry: while (batchcount > 0) { struct list_head *entry; - struct slab *slabp; + struct page *page; /* Get slab alloc is to come from. */ entry = n->slabs_partial.next; if (entry == &n->slabs_partial) { @@ -2896,7 +2875,7 @@ retry: goto must_grow; } - slabp = list_entry(entry, struct slab, list); + page = list_entry(entry, struct page, lru); check_spinlock_acquired(cachep); /* @@ -2904,23 +2883,23 @@ retry: * there must be at least one object available for * allocation. */ - BUG_ON(slabp->active >= cachep->num); - while (slabp->active < cachep->num && batchcount--) { + BUG_ON(page->active >= cachep->num); + while (page->active < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, + ac_put_obj(cachep, ac, slab_get_obj(cachep, page, node)); } /* move slabp to correct slabp list: */ - list_del(&slabp->list); - if (slabp->active == cachep->num) - list_add(&slabp->list, &n->slabs_full); + list_del(&page->lru); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: @@ -3175,7 +3154,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct slab *slabp; + struct page *page; struct kmem_cache_node *n; void *obj; int x; @@ -3195,24 +3174,24 @@ retry: goto must_grow; } - slabp = list_entry(entry, struct slab, list); + page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); STATS_INC_NODEALLOCS(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(slabp->active == cachep->num); - obj = slab_get_obj(cachep, slabp, nodeid); + BUG_ON(page->active == cachep->num); + obj = slab_get_obj(cachep, page, nodeid); n->free_objects--; /* move slabp to correct slabp list: */ - list_del(&slabp->list); + list_del(&page->lru); - if (slabp->active == cachep->num) - list_add(&slabp->list, &n->slabs_full); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); spin_unlock(&n->list_lock); goto done; @@ -3362,21 +3341,21 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, for (i = 0; i < nr_objects; i++) { void *objp; - struct slab *slabp; + struct page *page; 
clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; - slabp = virt_to_slab(objp); + page = virt_to_head_page(objp); n = cachep->node[node]; - list_del(&slabp->list); + list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, slabp, objp, node); + slab_put_obj(cachep, page, objp, node); STATS_DEC_ACTIVE(cachep); n->free_objects++; /* fixup slab chains */ - if (slabp->active == 0) { + if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; /* No need to drop any previously held @@ -3385,16 +3364,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, page); } else { - list_add(&slabp->list, &n->slabs_free); + list_add(&page->lru, &n->slabs_free); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&slabp->list, &n->slabs_partial); + list_add_tail(&page->lru, &n->slabs_partial); } } } @@ -3434,10 +3413,10 @@ free_done: p = n->slabs_free.next; while (p != &(n->slabs_free)) { - struct slab *slabp; + struct page *page; - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->active); + page = list_entry(p, struct page, lru); + BUG_ON(page->active); i++; p = p->next; @@ -4041,7 +4020,7 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct slab *slabp; + struct page *page; unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs = 0; @@ -4061,22 +4040,22 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &n->slabs_full, list) { - if (slabp->active != cachep->num && !error) + list_for_each_entry(page, &n->slabs_full, lru) { + if (page->active != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_partial, list) { - if (slabp->active == cachep->num && !error) + list_for_each_entry(page, &n->slabs_partial, lru) { + if (page->active == cachep->num && !error) error = "slabs_partial accounting error"; - if (!slabp->active && !error) + if (!page->active && !error) error = "slabs_partial accounting error"; - active_objs += slabp->active; + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_free, list) { - if (slabp->active && !error) + list_for_each_entry(page, &n->slabs_free, lru) { + if (page->active && !error) error = "slabs_free accounting error"; num_slabs++; } @@ -4229,19 +4208,20 @@ static inline int add_caller(unsigned long *n, unsigned long v) return 1; } -static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +static void handle_slab(unsigned long *n, struct kmem_cache *c, + struct page *page) { void *p; int i, j; if (n[0] == n[1]) return; - for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { + for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { bool active = true; - for (j = s->active; j < c->num; j++) { + for (j = page->active; j < c->num; j++) { /* Skip freed item */ - if (slab_bufctl(s)[j] == i) { + if (slab_bufctl(page)[j] == i) { active = false; break; } @@ -4273,7 +4253,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = 
list_entry(p, struct kmem_cache, list); - struct slab *slabp; + struct page *page; struct kmem_cache_node *n; const char *name; unsigned long *x = m->private; @@ -4297,10 +4277,10 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &n->slabs_full, list) - handle_slab(x, cachep, slabp); - list_for_each_entry(slabp, &n->slabs_partial, list) - handle_slab(x, cachep, slabp); + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); spin_unlock_irq(&n->list_lock); } name = cachep->name; -- cgit v1.3-7-g2ca7 From 7ecccf9d1e416fe61bb1caa0a94605b522feeaf3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:50 +0900 Subject: slab: remove useless statement for checking pfmemalloc Now virt_to_page(page->s_mem) is the same as the page itself, because the slab allocator uses struct page for management. So remove the useless statement. Acked-by: Andi Kleen Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0e7f2e73e08e..fbb594f2259a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -750,9 +750,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, static inline bool is_slab_pfmemalloc(struct page *page) { - struct page *mem_page = virt_to_page(page->s_mem); - - return PageSlabPfmemalloc(mem_page); + return PageSlabPfmemalloc(page); } /* Clears pfmemalloc_active if no slabs have pfmalloc set */ @@ -817,7 +815,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, n = cachep->node[numa_mem_id()]; if (!list_empty(&n->slabs_free) && force_refill) { struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(virt_to_head_page(page->s_mem)); + ClearPageSlabPfmemalloc(page); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -850,8 +848,7 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ struct page *page = virt_to_head_page(objp); - struct page *mem_page = virt_to_head_page(page->s_mem); - if (PageSlabPfmemalloc(mem_page)) + if (PageSlabPfmemalloc(page)) set_obj_pfmemalloc(&objp); } -- cgit v1.3-7-g2ca7 From e7444d9b7d49898a667074122bed2dcedf349305 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 24 Oct 2013 10:07:51 +0900 Subject: slab: rename slab_bufctl to slab_freelist Now bufctl is no longer a proper name for this array, so rename it. 
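As an aside for readers following the series, here is a minimal standalone sketch (mine, not from any patch here; the struct and names are invented for illustration) of the index-array freelist that slab_freelist() exposes: entries at positions >= active hold the indices of free objects, so allocation pops from the boundary and freeing pushes back onto it, exactly mirroring the slab_get_obj()/slab_put_obj() pair above.

	/* Illustrative model of the per-slab freelist of object indices. */
	struct toy_slab {
		unsigned int active;		/* objects currently in use */
		unsigned int num;		/* total objects in the slab */
		unsigned int freelist[16];	/* entries [active..num) are free indices */
	};

	static unsigned int toy_get_obj(struct toy_slab *s)
	{
		/* pop the next free index; the active region grows by one */
		return s->freelist[s->active++];
	}

	static void toy_put_obj(struct toy_slab *s, unsigned int objnr)
	{
		/* push the freed index back; the active region shrinks by one */
		s->freelist[--s->active] = objnr;
	}
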
Acked-by: Andi Kleen Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index fbb594f2259a..af2db76e1f5a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2550,7 +2550,7 @@ static struct freelist *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline unsigned int *slab_bufctl(struct page *page) +static inline unsigned int *slab_freelist(struct page *page) { return (unsigned int *)(page->freelist); } @@ -2597,7 +2597,7 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_bufctl(page)[i] = i; + slab_freelist(page)[i] = i; } } @@ -2616,7 +2616,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, { void *objp; - objp = index_to_obj(cachep, page, slab_bufctl(page)[page->active]); + objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); page->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); @@ -2637,7 +2637,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { - if (slab_bufctl(page)[i] == objnr) { + if (slab_freelist(page)[i] == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2645,7 +2645,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; - slab_bufctl(page)[page->active] = objnr; + slab_freelist(page)[page->active] = objnr; } /* @@ -4218,7 +4218,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, for (j = page->active; j < c->num; j++) { /* Skip freed item */ - if (slab_bufctl(page)[j] == i) { + if (slab_freelist(page)[j] == i) { active = false; break; } -- cgit v1.3-7-g2ca7 From d56791b38e34e480d869d1b88735df16c81aa684 Mon Sep 17 00:00:00 2001 From: Roman Bobniev Date: Tue, 8 Oct 2013 15:58:57 -0700 Subject: slub: proper kmemleak tracking if CONFIG_SLUB_DEBUG disabled Move all kmemleak calls into hook functions, and make it so that all hooks (both inside and outside of #ifdef CONFIG_SLUB_DEBUG) call the appropriate kmemleak routines. This allows for kmemleak to be configured independently of slub debug features. It also fixes a bug where kmemleak was only partially enabled in some configurations. Acked-by: Catalin Marinas Acked-by: Christoph Lameter Signed-off-by: Roman Bobniev Signed-off-by: Tim Bird Signed-off-by: Pekka Enberg --- mm/slub.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e3ba1f2cf60c..250062c66ec5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -928,6 +928,16 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. 
*/ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -1253,13 +1263,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { return 0; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} + void *object) +{ + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, + flags & gfp_allowed_mask); +} -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} #endif /* CONFIG_SLUB_DEBUG */ @@ -3265,7 +3292,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) if (page) ptr = page_address(page); - kmemleak_alloc(ptr, size, 1, flags); + kmalloc_large_node_hook(ptr, size, flags); return ptr; } @@ -3365,7 +3392,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kmemleak_free(x); + kfree_hook(x); __free_memcg_kmem_pages(page, compound_order(page)); return; } -- cgit v1.3-7-g2ca7 From 0172f779e4314639a8ed440082cfe9e3450954e8 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 30 Oct 2013 19:04:00 +0900 Subject: slab: fix to calm down kmemleak warning Now that struct page is used for slab management, we should not call kmemleak_scan_area(), since struct page isn't an object tracked by kmemleak. Without this patch, many kmemleak warnings are printed if CONFIG_DEBUG_KMEMLEAK is enabled. Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index af2db76e1f5a..a8a93497d7d6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2531,14 +2531,6 @@ static struct freelist *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - /* - * If the first object in the slab is leaked (it's allocated - * but no one has a reference to it), we want to make sure - * kmemleak does not treat the ->s_mem pointer as a reference - * to the object. Otherwise we will not report the leak. - */ - kmemleak_scan_area(&page->lru, sizeof(struct list_head), - local_flags); if (!freelist) return NULL; } else { -- cgit v1.3-7-g2ca7 From 7e00735520ffb00bda3e08c441d0a4dba42913a7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 30 Oct 2013 19:04:01 +0900 Subject: slab: replace non-existing 'struct freelist *' with 'void *' There is no 'struct freelist', but the code uses pointers to 'struct freelist'. Although the compiler doesn't complain about this wrong usage and the code works fine, fixing it is better. 
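To see why the bogus type went unnoticed, here is a tiny standalone example (mine, not from the patch): in C, a pointer to an undeclared struct is merely a pointer to an incomplete type, which is legal as long as the pointee is never accessed, so 'struct freelist *' behaved exactly like 'void *' here.

	struct freelist;			/* declared but never defined */

	static struct freelist *stash;

	void stash_set(void *p)
	{
		stash = p;			/* ok: void * converts implicitly */
	}

	void *stash_get(void)
	{
		return stash;			/* ok: the pointee is never dereferenced */
	}
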
Signed-off-by: Joonsoo Kim Signed-off-by: Pekka Enberg --- mm/slab.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a8a93497d7d6..a983e3084332 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1954,7 +1954,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, */ static void slab_destroy(struct kmem_cache *cachep, struct page *page) { - struct freelist *freelist; + void *freelist; freelist = page->freelist; slab_destroy_debugcheck(cachep, page); @@ -2520,11 +2520,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) * kmem_find_general_cachep till the initialization is complete. * Hence we cannot have freelist_cache same as the original cache. */ -static struct freelist *alloc_slabmgmt(struct kmem_cache *cachep, +static void *alloc_slabmgmt(struct kmem_cache *cachep, struct page *page, int colour_off, gfp_t local_flags, int nodeid) { - struct freelist *freelist; + void *freelist; void *addr = page_address(page); if (OFF_SLAB(cachep)) { @@ -2646,7 +2646,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, * virtual address for kfree, ksize, and slab debugging. */ static void slab_map_pages(struct kmem_cache *cache, struct page *page, - struct freelist *freelist) + void *freelist) { page->slab_cache = cache; page->freelist = freelist; @@ -2659,7 +2659,7 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, struct page *page) { - struct freelist *freelist; + void *freelist; size_t offset; gfp_t local_flags; struct kmem_cache_node *n; -- cgit v1.3-7-g2ca7 From c6f58d9b362b45c52afebe4342c9137d0dabe47f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 7 Nov 2013 16:29:15 +0000 Subject: slub: Handle NULL parameter in kmem_cache_flags Andreas Herrmann writes: When I've used slub_debug kernel option (e.g. "slub_debug=,skbuff_fclone_cache" or similar) on a debug session I've seen a panic like: Highbank #setenv bootargs console=ttyAMA0 root=/dev/sda2 kgdboc.kgdboc=ttyAMA0,115200 slub_debug=,kmalloc-4096 earlyprintk=ttyAMA0 ... 
Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = c0004000 [00000000] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Modules linked in: CPU: 0 PID: 0 Comm: swapper Tainted: G W 3.12.0-00048-gbe408cd #314 task: c0898360 ti: c088a000 task.ti: c088a000 PC is at strncmp+0x1c/0x84 LR is at kmem_cache_flags.isra.46.part.47+0x44/0x60 pc : [] lr : [] psr: 200001d3 sp : c088bea8 ip : c088beb8 fp : c088beb4 r10: 00000000 r9 : 413fc090 r8 : 00000001 r7 : 00000000 r6 : c2984a08 r5 : c0966e78 r4 : 00000000 r3 : 0000006b r2 : 0000000c r1 : 00000000 r0 : c2984a08 Flags: nzCv IRQs off FIQs off Mode SVC_32 ISA ARM Segment kernel Control: 10c5387d Table: 0000404a DAC: 00000015 Process swapper (pid: 0, stack limit = 0xc088a248) Stack: (0xc088bea8 to 0xc088c000) bea0: c088bed4 c088beb8 c0110a3c c02c6d90 c0966e78 00000040 bec0: ef001f00 00000040 c088bf14 c088bed8 c0112070 c0110a04 00000005 c010fac8 bee0: c088bf5c c088bef0 c010fac8 ef001f00 00000040 00000000 00000040 00000001 bf00: 413fc090 00000000 c088bf34 c088bf18 c0839190 c0112040 00000000 ef001f00 bf20: 00000000 00000000 c088bf54 c088bf38 c0839200 c083914c 00000006 c0961c4c bf40: c0961c28 00000000 c088bf7c c088bf58 c08392ac c08391c0 c08a2ed8 c0966e78 bf60: c086b874 c08a3f50 c0961c28 00000001 c088bfb4 c088bf80 c083b258 c0839248 bf80: 2f800000 0f000000 c08935b4 ffffffff c08cd400 ffffffff c08cd400 c0868408 bfa0: c29849c0 00000000 c088bff4 c088bfb8 c0824974 c083b1e4 ffffffff ffffffff bfc0: c08245c0 00000000 00000000 c0868408 00000000 10c5387d c0892bcc c0868404 bfe0: c0899440 0000406a 00000000 c088bff8 00008074 c0824824 00000000 00000000 [] (strncmp+0x1c/0x84) from [] (kmem_cache_flags.isra.46.part.47+0x44/0x60) [] (kmem_cache_flags.isra.46.part.47+0x44/0x60) from [] (__kmem_cache_create+0x3c/0x410) [] (__kmem_cache_create+0x3c/0x410) from [] (create_boot_cache+0x50/0x74) [] (create_boot_cache+0x50/0x74) from [] (create_kmalloc_cache+0x4c/0x88) [] (create_kmalloc_cache+0x4c/0x88) from [] (create_kmalloc_caches+0x70/0x114) [] (create_kmalloc_caches+0x70/0x114) from [] (kmem_cache_init+0x80/0xe0) [] (kmem_cache_init+0x80/0xe0) from [] (start_kernel+0x15c/0x318) [] (start_kernel+0x15c/0x318) from [<00008074>] (0x8074) Code: e3520000 01a00002 089da800 e5d03000 (e5d1c000) ---[ end trace 1b75b31a2719ed1d ]--- Kernel panic - not syncing: Fatal exception Problem is that slub_debug option is not parsed before create_boot_cache is called. Solve this by changing slub_debug to early_param. Kernels 3.11, 3.10 are also affected. I am not sure about older kernels. Christoph Lameter explains: kmem_cache_flags may be called with NULL parameter during early boot. Skip the test in that case. Cc: stable@vger.kernel.org # 3.10 and 3.11 Reported-by: Andreas Herrmann Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 250062c66ec5..2cf39826d4f0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1220,8 +1220,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size, /* * Enable debugging if selected on the kernel commandline. 
*/ - if (slub_debug && (!slub_debug_slabs || - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + if (slub_debug && (!slub_debug_slabs || (name && + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) flags |= slub_debug; return flags; -- cgit v1.3-7-g2ca7 From 721ae22ae1a51c25871b7a0b543877aa94ff2a20 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Fri, 8 Nov 2013 20:47:37 +0800 Subject: mm, slub: fix the typo in mm/slub.c Acked-by: Christoph Lameter Signed-off-by: Zhi Yong Wu Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2cf39826d4f0..1ee14acb3bda 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 @@ -2849,8 +2849,8 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) -- cgit v1.3-7-g2ca7 From 8b2e9b712f6139df9c754af0d67fecc4bbc88545 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 20 Nov 2013 14:41:47 -0800 Subject: Revert "mm: create a separate slab for page->ptl allocation" This reverts commit ea1e7ed33708c7a760419ff9ded0a6cb90586a50. Al points out that while the commit *does* actually create a separate slab for the page->ptl allocation, that slab is never actually used, and the code continues to use kmalloc/kfree. Damien Wyart points out that the original patch did have the conversion to use kmem_cache_alloc/free, so it got lost somewhere on its way to me. Revert the half-arsed attempt that didn't do anything. If we really do want the special slab (remember: this is all relevant just for debug builds, so it's not necessarily all that critical) we might as well redo the patch fully. 
Reported-by: Al Viro Acked-by: Andrew Morton Cc: Kirill A Shutemov Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 --------- init/main.c | 2 +- mm/memory.c | 7 ------- 3 files changed, 1 insertion(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0548eb201e05..1cedd000cf29 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1318,7 +1318,6 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #if USE_SPLIT_PTE_PTLOCKS #if BLOATED_SPINLOCKS -void __init ptlock_cache_init(void); extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); @@ -1327,7 +1326,6 @@ static inline spinlock_t *ptlock_ptr(struct page *page) return page->ptl; } #else /* BLOATED_SPINLOCKS */ -static inline void ptlock_cache_init(void) {} static inline bool ptlock_alloc(struct page *page) { return true; @@ -1380,17 +1378,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } -static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct page *page) { return true; } static inline void pte_lock_deinit(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ -static inline void pgtable_init(void) -{ - ptlock_cache_init(); - pgtable_cache_init(); -} - static inline bool pgtable_page_ctor(struct page *page) { inc_zone_page_state(page, NR_PAGETABLE); diff --git a/init/main.c b/init/main.c index 01573fdfa186..febc511e078a 100644 --- a/init/main.c +++ b/init/main.c @@ -476,7 +476,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); percpu_init_late(); - pgtable_init(); + pgtable_cache_init(); vmalloc_init(); } diff --git a/mm/memory.c b/mm/memory.c index 0409e8f43fa0..5d9025f3b3e1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4272,13 +4272,6 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS -static struct kmem_cache *page_ptl_cachep; -void __init ptlock_cache_init(void) -{ - page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, - SLAB_PANIC, NULL); -} - bool ptlock_alloc(struct page *page) { spinlock_t *ptl; -- cgit v1.3-7-g2ca7 From 30b0a105d9f7141e4cbf72ae5511832457d89788 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 21 Nov 2013 14:31:58 -0800 Subject: mm: thp: give transparent hugepage code a separate copy_page Right now, the migration code in migrate_page_copy() uses copy_huge_page() for hugetlbfs and thp pages: if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); So, yay for code reuse. But: void copy_huge_page(struct page *dst, struct page *src) { struct hstate *h = page_hstate(src); and a non-hugetlbfs page has no page_hstate(). This works 99% of the time because page_hstate() determines the hstate from the page order alone. Since the page order of a THP page matches the default hugetlbfs page order, it works. But, if you change the default huge page size on the boot command-line (say default_hugepagesz=1G), then we might not even *have* a 2MB hstate so page_hstate() returns null and copy_huge_page() oopses pretty fast since copy_huge_page() dereferences the hstate: void copy_huge_page(struct page *dst, struct page *src) { struct hstate *h = page_hstate(src); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { ... Mel noticed that the migration code is really the only user of these functions. 
This moves all the copy code over to migrate.c and makes copy_huge_page() work for THP by checking for it explicitly. I believe the bug was introduced in commit b32967ff101a ("mm: numa: Add THP migration for the NUMA working set scanning fault case") [akpm@linux-foundation.org: fix coding-style and comment text, per Naoya Horiguchi] Signed-off-by: Dave Hansen Acked-by: Mel Gorman Reviewed-by: Naoya Horiguchi Cc: Hillf Danton Cc: Andrea Arcangeli Tested-by: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ---- mm/hugetlb.c | 34 ---------------------------------- mm/migrate.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index acd2010328f3..85e0c58bdfdf 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -69,7 +69,6 @@ int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); bool is_hugepage_active(struct page *page); -void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); @@ -140,9 +139,6 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) #define isolate_huge_page(p, l) false #define putback_active_hugepage(p) do {} while (0) #define is_hugepage_active(x) false -static inline void copy_huge_page(struct page *dst, struct page *src) -{ -} static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7d57af21f49e..2130365d387d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -476,40 +476,6 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) return 0; } -static void copy_gigantic_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } -} - static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); diff --git a/mm/migrate.c b/mm/migrate.c index 316e720a2023..bb940045fe85 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -441,6 +441,54 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } +/* + * Gigantic pages are so large that we do not guarantee that page++ pointer + * arithmetic will work across the entire page. We need something more + * specialized. 
+ */ +static void __copy_gigantic_page(struct page *dst, struct page *src, + int nr_pages) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < nr_pages; ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +static void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + int nr_pages; + + if (PageHuge(src)) { + /* hugetlbfs page */ + struct hstate *h = page_hstate(src); + nr_pages = pages_per_huge_page(h); + + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { + __copy_gigantic_page(dst, src, nr_pages); + return; + } + } else { + /* thp page */ + BUG_ON(!PageTransHuge(src)); + nr_pages = hpage_nr_pages(src); + } + + for (i = 0; i < nr_pages; i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + /* * Copy the page to its new location */ -- cgit v1.3-7-g2ca7 From 27c73ae759774e63313c1fbfeb17ba076cea64c5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 21 Nov 2013 14:32:02 -0800 Subject: mm: hugetlbfs: fix hugetlbfs optimization Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database caused by THP") can cause dereference of a dangling pointer if split_huge_page runs during PageHuge() if there are updates to the tail_page->private field. Also it is repeating compound_head twice for hugetlbfs and it is running compound_head+compound_trans_head for THP when a single one is needed in both cases. The new code within the PageSlab() check doesn't need to verify that the THP page size is never bigger than the smallest hugetlbfs page size, to avoid memory corruption. A longstanding theoretical race condition was found while fixing the above (see the change right after the skip_unlock label, that is relevant for the compound_lock path too). 
By re-establishing the _mapcount tail refcounting for all compound pages, this also fixes the below problem: echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages BUG: Bad page state in process bash pfn:59a01 page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0 page flags: 0x1c00000000008000(tail) Modules linked in: CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x55/0x76 bad_page+0xd5/0x130 free_pages_prepare+0x213/0x280 __free_pages+0x36/0x80 update_and_free_page+0xc1/0xd0 free_pool_huge_page+0xc2/0xe0 set_max_huge_pages.part.58+0x14c/0x220 nr_hugepages_store_common.isra.60+0xd0/0xf0 nr_hugepages_store+0x13/0x20 kobj_attr_store+0xf/0x20 sysfs_write_file+0x189/0x1e0 vfs_write+0xc5/0x1f0 SyS_write+0x55/0xb0 system_call_fastpath+0x16/0x1b Signed-off-by: Khalid Aziz Signed-off-by: Andrea Arcangeli Tested-by: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++ mm/hugetlb.c | 17 ++++++ mm/swap.c | 143 ++++++++++++++++++++++++++++-------------------- 3 files changed, 106 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 85e0c58bdfdf..9649ff0c63f8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -31,6 +31,7 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); int PageHuge(struct page *page); +int PageHeadHuge(struct page *page_head); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -103,6 +104,11 @@ static inline int PageHuge(struct page *page) return 0; } +static inline int PageHeadHuge(struct page *page_head) +{ + return 0; +} + static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2130365d387d..dee6cf4e6d34 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -702,6 +702,23 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); +/* + * PageHeadHuge() only returns true for hugetlbfs head page, but not for + * normal or transparent huge pages. + */ +int PageHeadHuge(struct page *page_head) +{ + compound_page_dtor *dtor; + + if (!PageHead(page_head)) + return 0; + + dtor = get_compound_page_dtor(page_head); + + return dtor == free_huge_page; +} +EXPORT_SYMBOL_GPL(PageHeadHuge); + pgoff_t __basepage_index(struct page *page) { struct page *page_head = compound_head(page); diff --git a/mm/swap.c b/mm/swap.c index 7a9f80d451f5..84b26aaabd03 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - /* - * hugetlbfs pages cannot be split from under us. If this is a - * hugetlbfs page, check refcount on head page and release the page if - * the refcount becomes zero. 
- */ - if (PageHuge(page)) { - page = compound_head(page); - if (put_page_testzero(page)) - __put_compound_page(page); - - return; - } - if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -111,14 +98,31 @@ static void put_compound_page(struct page *page) * still hot on arches that do not support * this_cpu_cmpxchg_double(). */ - if (PageSlab(page_head)) { - if (PageTail(page)) { + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + atomic_dec(&page->_mapcount); if (put_page_testzero(page_head)) VM_BUG_ON(1); - - atomic_dec(&page->_mapcount); - goto skip_lock_tail; + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + return; } else + /* + * __split_huge_page_refcount + * run before us, "page" was a + * THP tail. The split + * page_head has been freed + * and reallocated as slab or + * hugetlbfs page of smaller + * order (only possible if + * reallocated as slab on + * x86). + */ goto skip_lock; } /* @@ -132,8 +136,27 @@ static void put_compound_page(struct page *page) /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); skip_lock: - if (put_page_testzero(page_head)) - __put_single_page(page_head); + if (put_page_testzero(page_head)) { + /* + * The head page may have been + * freed and reallocated as a + * compound page of smaller + * order and then freed again. + * All we know is that it + * cannot have become: a THP + * page, a compound page of + * higher order, a tail page. + * That is because we still + * hold the refcount of the + * split THP tail and + * page_head was the THP head + * before the split. + */ + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } out_put_single: if (put_page_testzero(page)) __put_single_page(page); @@ -155,7 +178,6 @@ out_put_single: VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); -skip_lock_tail: if (put_page_testzero(page_head)) { if (PageHead(page_head)) __put_compound_page(page_head); @@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ + unsigned long flags; bool got = false; - struct page *page_head; - - /* - * If this is a hugetlbfs page it cannot be split under us. Simply - * increment refcount for the head page. - */ - if (PageHuge(page)) { - page_head = compound_head(page); - atomic_inc(&page_head->_count); - got = true; - } else { - unsigned long flags; + struct page *page_head = compound_trans_head(page); - page_head = compound_trans_head(page); - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head)) { - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - return true; - } else { - put_page(page_head); - return false; - } - } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* Ref to put_compound_page() comment. 
*/ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ put_page(page_head); + return false; + } + } + + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } return got; } -- cgit v1.3-7-g2ca7 From b7a9f420ed737cb7cd4075ba06ac1a6f0da9f878 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 21 Nov 2013 14:32:06 -0800 Subject: mm, mempolicy: silence gcc warning Fengguang Wu reports that compiling mm/mempolicy.c results in a warning: mm/mempolicy.c: In function 'mpol_to_str': mm/mempolicy.c:2878:2: error: format not a string literal and no format arguments Kees says this is because he is using -Wformat-security. Silence the warning. Signed-off-by: David Rientjes Reported-by: Fengguang Wu Suggested-by: Kees Cook Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c4403cdf3433..eca4a3129129 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2950,7 +2950,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) return; } - p += snprintf(p, maxlen, policy_modes[mode]); + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); -- cgit v1.3-7-g2ca7 From c7277090927a5e71871e799a355ed2940f6c8fc6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 2 Dec 2013 11:24:19 +0000 Subject: security: shmem: implement kernel private shmem inodes We have a problem where the big_key key storage implementation uses a shmem backed inode to hold the key contents. Because of this detail of implementation LSM checks are being done between processes trying to read the keys and the tmpfs backed inode. The LSM checks are already being handled on the key interface level and should not be enforced at the inode level (since the inode is an implementation detail, not a part of the security model) This patch implements a new function shmem_kernel_file_setup() which returns the equivalent to shmem_file_setup() only the underlying inode has S_PRIVATE set. This means that all LSM checks for the inode in question are skipped. It should only be used for kernel internal operations where the inode is not exposed to userspace without proper LSM checking. It is possible that some other users of shmem_file_setup() should use the new interface, but this has not been explored. Reproducing this bug is a little bit difficult. 
The steps I used on Fedora are: (1) Turn off selinux enforcing: setenforce 0 (2) Create a huge key k=`dd if=/dev/zero bs=8192 count=1 | keyctl padd big_key test-key @s` (3) Access the key in another context: runcon system_u:system_r:httpd_t:s0-s0:c0.c1023 keyctl print $k >/dev/null (4) Examine the audit logs: ausearch -m AVC -i --subject httpd_t | audit2allow If the last command's output includes a line that looks like: allow httpd_t user_tmpfs_t:file { open read }; There was an inode check between httpd and the tmpfs filesystem. With this patch no such denial will be seen. (NOTE! you should clear your audit log if you have tested for this previously) (Please return your box to enforcing) Signed-off-by: Eric Paris Signed-off-by: David Howells cc: Hugh Dickins cc: linux-mm@kvack.org --- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 36 +++++++++++++++++++++++++++++------- security/keys/big_key.c | 2 +- 3 files changed, 32 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 30aa0dc60d75..9d55438bc4ad 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -47,6 +47,8 @@ extern int shmem_init(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); +extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, + unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern void shmem_unlock_mapping(struct address_space *mapping); diff --git a/mm/shmem.c b/mm/shmem.c index 8297623fcaed..902a14842b74 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2918,13 +2918,8 @@ static struct dentry_operations anon_ops = { .d_dname = simple_dname }; -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps - * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size - */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +static struct file *__shmem_file_setup(const char *name, loff_t size, + unsigned long flags, unsigned int i_flags) { struct file *res; struct inode *inode; @@ -2957,6 +2952,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (!inode) goto put_dentry; + inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ @@ -2977,6 +2973,32 @@ put_memory: shmem_unacct_size(flags, size); return res; } + +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The one user is the big_key implementation. LSM checks + * are provided at the key level rather than the inode level. 
+ * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, S_PRIVATE); +} + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, 0); +} EXPORT_SYMBOL_GPL(shmem_file_setup); /** diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 7f44c3207a9b..8137b27d641d 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -70,7 +70,7 @@ int big_key_instantiate(struct key *key, struct key_preparsed_payload *prep) * * TODO: Encrypt the stored data with a temporary key. */ - file = shmem_file_setup("", datalen, 0); + file = shmem_kernel_file_setup("", datalen, 0); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_quota; -- cgit v1.3-7-g2ca7 From a0d8b00a3381f9d75764b3377590451cb0b4fe41 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Dec 2013 17:12:20 -0800 Subject: mm: memcg: do not declare OOM from __GFP_NOFAIL allocations Commit 84235de394d9 ("fs: buffer: move allocation failure loop into the allocator") started recognizing __GFP_NOFAIL in memory cgroups but forgot to disable the OOM killer. Any task that does not fail allocation will also not enter the OOM completion path. So don't declare an OOM state in this case or it'll be leaked and the task will be able to bypass the limit until the next userspace-triggered page fault cleans up the OOM state. Reported-by: William Dauchy Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: [3.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f1a0ae6e11b8..e3aff0175d4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2696,6 +2696,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(task_in_memcg_oom(current))) goto bypass; + if (gfp_mask & __GFP_NOFAIL) + oom = false; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the -- cgit v1.3-7-g2ca7 From 3592806cfa08b7cca968f793c33f8e9460bab395 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Thu, 12 Dec 2013 17:12:33 -0800 Subject: thp: move preallocated PTE page table on move_huge_pmd() Andrey Wagin reported crash on VM_BUG_ON() in pgtable_pmd_page_dtor() with fallowing backtrace: free_pgd_range+0x2bf/0x410 free_pgtables+0xce/0x120 unmap_region+0xe0/0x120 do_munmap+0x249/0x360 move_vma+0x144/0x270 SyS_mremap+0x3b9/0x510 system_call_fastpath+0x16/0x1b The crash can be reproduce with this test case: #define _GNU_SOURCE #include #include #include #define MB (1024 * 1024UL) #define GB (1024 * MB) int main(int argc, char **argv) { char *p; int i; p = mmap((void *) GB, 10 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); for (i = 0; i < 10 * MB; i += 4096) p[i] = 1; mremap(p, 10 * MB, 10 * MB, MREMAP_FIXED | MREMAP_MAYMOVE, 2 * GB); return 0; } Due to split PMD lock, we now store preallocated PTE tables for THP pages per-PMD table. It means we need to move them to other PMD table if huge PMD moved there. Signed-off-by: Kirill A. Shutemov Reported-by: Andrey Vagin Tested-by: Andrey Vagin Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bccd5a628ea6..33a5dc492810 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) + if (new_ptl != old_ptl) { + pgtable_t pgtable; + + /* + * Move preallocated PTE page table if new_pmd is on + * different PMD page table. + */ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + spin_unlock(new_ptl); + } spin_unlock(old_ptl); } out: -- cgit v1.3-7-g2ca7 From 96f1c58d853497a757463e0b57fed140d6858f3a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Dec 2013 17:12:34 -0800 Subject: mm: memcg: fix race condition between memcg teardown and swapin There is a race condition between a memcg being torn down and a swapin triggered from a different memcg of a page that was recorded to belong to the exiting memcg on swapout (with CONFIG_MEMCG_SWAP extension). The result is unreclaimable pages pointing to dead memcgs, which can lead to anything from endless loops in later memcg teardown (the page is charged to all hierarchical parents but is not on any LRU list) or crashes from following the dangling memcg pointer. Memcgs with tasks in them can not be torn down and usually charges don't show up in memcgs without tasks. Swapin with the CONFIG_MEMCG_SWAP extension is the notable exception because it charges the cgroup that was recorded as owner during swapout, which may be empty and in the process of being torn down when a task in another memcg triggers the swapin: teardown: swapin: lookup_swap_cgroup_id() rcu_read_lock() mem_cgroup_lookup() css_tryget() rcu_read_unlock() disable css_tryget() call_rcu() offline_css() reparent_charges() res_counter_charge() (hierarchical!) css_put() css_free() pc->mem_cgroup = dead memcg add page to dead lru Add a final reparenting step into css_free() to make sure any such raced charges are moved out of the memcg before it's finally freed. 
In the longer term it would be cleaner to have the css_tryget() and the res_counter charge under the same RCU lock section so that the charge reparenting is deferred until the last charge whose tryget succeeded is visible. But this will require more invasive changes that will be harder to evaluate and backport into stable, so better defer them to a separate change set. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3aff0175d4c..f6a63f5b3827 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6355,6 +6355,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * XXX: css_offline() would be where we should reparent all + * memory to prepare the cgroup for destruction. However, + * memcg does not do css_tryget() and res_counter charging + * under the same RCU lock region, which means that charging + * could race with offlining. Offlining only happens to + * cgroups with no tasks in them but charges can show up + * without any tasks from the swapin path when the target + * memcg is looked up from the swapout record and not from the + * current task as it usually is. A race like this can leak + * charges and put pages with stale cgroup pointers into + * circulation: + * + * #0 #1 + * lookup_swap_cgroup_id() + * rcu_read_lock() + * mem_cgroup_lookup() + * css_tryget() + * rcu_read_unlock() + * disable css_tryget() + * call_rcu() + * offline_css() + * reparent_charges() + * res_counter_charge() + * css_put() + * css_free() + * pc->mem_cgroup = dead memcg + * add page to lru + * + * The bulk of the charges are still moved in offline_css() to + * avoid pinning a lot of pages in case a long-term reference + * like a swapout record is deferring the css_free() to long + * after offlining. But this makes sure we catch any charges + * made after offlining: + */ + mem_cgroup_reparent_charges(memcg); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); -- cgit v1.3-7-g2ca7 From 1f14c1ac19aa45118054b6d5425873c5c7fc23a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Dec 2013 17:12:35 -0800 Subject: mm: memcg: do not allow task about to OOM kill to bypass the limit Commit 4942642080ea ("mm: memcg: handle non-error OOM situations more gracefully") allowed tasks that already entered a memcg OOM condition to bypass the memcg limit on subsequent allocation attempts hoping this would expedite finishing the page fault and executing the kill. David Rientjes is worried that this breaks memcg isolation guarantees and since there is no evidence that the bypass actually speeds up fault processing just change it so that these subsequent charge attempts fail outright. The notable exception being __GFP_NOFAIL charges which are required to bypass the limit regardless. 
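Read together with the __GFP_NOFAIL change earlier in this series, the slow-path policy after both patches can be summarized as follows (a condensed sketch drawn from the two diffs, not the literal kernel code):

	/* sketch of the __mem_cgroup_try_charge() decisions after both fixes */
	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;		/* already under memcg OOM: fail outright,
					 * do not bypass the limit */

	if (gfp_mask & __GFP_NOFAIL)
		oom = false;		/* the charge must succeed, so never
					 * declare a memcg OOM state for it */
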
Signed-off-by: Johannes Weiner Reported-by: David Rientjes Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6a63f5b3827..bf5e89457149 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2694,7 +2694,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto bypass; if (unlikely(task_in_memcg_oom(current))) - goto bypass; + goto nomem; if (gfp_mask & __GFP_NOFAIL) oom = false; -- cgit v1.3-7-g2ca7
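As a closing illustration of the shmem_kernel_file_setup() interface added earlier in this series, here is a sketch of the caller side (my illustration following the big_key usage shown above; kernel_write() and the error-handling shape are assumptions, not taken from these patches):

	struct file *file;
	ssize_t written;

	/* kernel-private tmpfs file: the inode is S_PRIVATE, so no LSM
	 * inode checks fire on reads and writes */
	file = shmem_kernel_file_setup("", datalen, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* access control is the caller's job, at a higher layer */
	written = kernel_write(file, data, datalen, 0);
	if (written != datalen) {
		fput(file);
		return written < 0 ? written : -EIO;
	}
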