Diffstat (limited to 'mm/slab.c')
-rw-r--r--	mm/slab.c	249
1 file changed, 162 insertions, 87 deletions
diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..e49f8f46f46d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
 #include	<linux/reciprocal_div.h>
 #include	<linux/debugobjects.h>
 #include	<linux/kmemcheck.h>
+#include	<linux/memory.h>
 
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
@@ -144,30 +145,6 @@
 #define	BYTES_PER_WORD		sizeof(void *)
 #define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
 
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
 #ifndef ARCH_KMALLOC_FLAGS
 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 #endif
@@ -844,7 +821,7 @@ static void init_reap_node(int cpu)
 {
 	int node;
 
-	node = next_node(cpu_to_node(cpu), node_online_map);
+	node = next_node(cpu_to_mem(cpu), node_online_map);
 	if (node == MAX_NUMNODES)
 		node = first_node(node_online_map);
 
@@ -1073,7 +1050,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 	struct array_cache *alien = NULL;
 	int node;
 
-	node = numa_node_id();
+	node = numa_mem_id();
 
 	/*
 	 * Make sure we are not freeing a object from another node to the array
@@ -1102,11 +1079,57 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 }
 #endif
 
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+	struct kmem_cache *cachep;
+	struct kmem_list3 *l3;
+	const int memsize = sizeof(struct kmem_list3);
+
+	list_for_each_entry(cachep, &cache_chain, next) {
+		/*
+		 * Set up the size64 kmemlist for cpu before we can
+		 * begin anything. Make sure some other cpu on this
+		 * node has not already allocated this
+		 */
+		if (!cachep->nodelists[node]) {
+			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+			if (!l3)
+				return -ENOMEM;
+			kmem_list3_init(l3);
+			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+			/*
+			 * The l3s don't come and go as CPUs come and
+			 * go.  cache_chain_mutex is sufficient
+			 * protection here.
+			 */
+			cachep->nodelists[node] = l3;
+		}
+
+		spin_lock_irq(&cachep->nodelists[node]->list_lock);
+		cachep->nodelists[node]->free_limit =
+			(1 + nr_cpus_node(node)) *
+			cachep->batchcount + cachep->num;
+		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+	}
+	return 0;
+}
+
 static void __cpuinit cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
-	int node = cpu_to_node(cpu);
+	int node = cpu_to_mem(cpu);
 	const struct cpumask *mask = cpumask_of_node(node);
 
 	list_for_each_entry(cachep, &cache_chain, next) {
@@ -1171,8 +1194,8 @@ static int __cpuinit cpuup_prepare(long cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
-	int node = cpu_to_node(cpu);
-	const int memsize = sizeof(struct kmem_list3);
+	int node = cpu_to_mem(cpu);
+	int err;
 
 	/*
 	 * We need to do this right in the beginning since
@@ -1180,35 +1203,9 @@ static int __cpuinit cpuup_prepare(long cpu)
 	 * kmalloc_node allows us to add the slab to the right
 	 * kmem_list3 and not this cpu's kmem_list3
 	 */
-
-	list_for_each_entry(cachep, &cache_chain, next) {
-		/*
-		 * Set up the size64 kmemlist for cpu before we can
-		 * begin anything. Make sure some other cpu on this
-		 * node has not already allocated this
-		 */
-		if (!cachep->nodelists[node]) {
-			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-			if (!l3)
-				goto bad;
-			kmem_list3_init(l3);
-			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-			/*
-			 * The l3s don't come and go as CPUs come and
-			 * go.  cache_chain_mutex is sufficient
-			 * protection here.
-			 */
-			cachep->nodelists[node] = l3;
-		}
-
-		spin_lock_irq(&cachep->nodelists[node]->list_lock);
-		cachep->nodelists[node]->free_limit =
-			(1 + nr_cpus_node(node)) *
-			cachep->batchcount + cachep->num;
-		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-	}
+	err = init_cache_nodelists_node(node);
+	if (err < 0)
+		goto bad;
 
 	/*
 	 * Now we can go ahead with allocating the shared arrays and
@@ -1324,18 +1321,82 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		mutex_unlock(&cache_chain_mutex);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __cpuinitdata cpucache_notifier = {
 	&cpuup_callback, NULL, 0
 };
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+	struct kmem_cache *cachep;
+	int ret = 0;
+
+	list_for_each_entry(cachep, &cache_chain, next) {
+		struct kmem_list3 *l3;
+
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		drain_freelist(cachep, l3, l3->free_objects);
+
+		if (!list_empty(&l3->slabs_full) ||
+		    !list_empty(&l3->slabs_partial)) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+	return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+					unsigned long action, void *arg)
+{
+	struct memory_notify *mnb = arg;
+	int ret = 0;
+	int nid;
+
+	nid = mnb->status_change_nid;
+	if (nid < 0)
+		goto out;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		mutex_lock(&cache_chain_mutex);
+		ret = init_cache_nodelists_node(nid);
+		mutex_unlock(&cache_chain_mutex);
+		break;
+	case MEM_GOING_OFFLINE:
+		mutex_lock(&cache_chain_mutex);
+		ret = drain_cache_nodelists_node(nid);
+		mutex_unlock(&cache_chain_mutex);
+		break;
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		break;
+	}
+out:
+	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
-			int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+				int nodeid)
 {
 	struct kmem_list3 *ptr;
 
@@ -1418,7 +1479,7 @@ void __init kmem_cache_init(void)
 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
-	node = numa_node_id();
+	node = numa_mem_id();
 
 	/* 1) create the cache_cache */
 	INIT_LIST_HEAD(&cache_chain);
@@ -1580,6 +1641,14 @@ void __init kmem_cache_init_late(void)
 	 */
 	register_cpu_notifier(&cpucache_notifier);
 
+#ifdef CONFIG_NUMA
+	/*
+	 * Register a memory hotplug callback that initializes and frees
+	 * nodelists.
+	 */
+	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
 	/*
 	 * The reap timers are started later, with a module init call: That part
 	 * of the kernel is not yet operational.
@@ -2052,7 +2121,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 			}
 		}
 	}
-	cachep->nodelists[numa_node_id()]->next_reap =
+	cachep->nodelists[numa_mem_id()]->next_reap =
 			jiffies + REAPTIMEOUT_LIST3 +
 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
@@ -2220,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (ralign < align) {
 		ralign = align;
 	}
-	/* disable debug if necessary */
-	if (ralign > __alignof__(unsigned long long))
+	/* disable debug if not aligning with REDZONE_ALIGN */
+	if (ralign & (__alignof__(unsigned long long) - 1))
 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
@@ -2247,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	if (flags & SLAB_RED_ZONE) {
 		/* add space for red zone words */
-		cachep->obj_offset += sizeof(unsigned long long);
-		size += 2 * sizeof(unsigned long long);
+		cachep->obj_offset += align;
+		size += align + sizeof(unsigned long long);
 	}
 	if (flags & SLAB_STORE_USER) {
 		/* user store requires one word storage behind the end of
@@ -2383,7 +2452,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+	assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
 #endif
 }
 
@@ -2410,7 +2479,7 @@ static void do_drain(void *arg)
 {
 	struct kmem_cache *cachep = arg;
 	struct array_cache *ac;
-	int node = numa_node_id();
+	int node = numa_mem_id();
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
@@ -2943,7 +3012,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 
 retry:
 	check_irq_off();
-	node = numa_node_id();
+	node = numa_mem_id();
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3147,11 +3216,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 
 	if (in_interrupt() || (flags & __GFP_THISNODE))
 		return NULL;
-	nid_alloc = nid_here = numa_node_id();
+	nid_alloc = nid_here = numa_mem_id();
+	get_mems_allowed();
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
-		nid_alloc = cpuset_mem_spread_node();
+		nid_alloc = cpuset_slab_spread_node();
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
+	put_mems_allowed();
 	if (nid_alloc != nid_here)
 		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
@@ -3178,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	if (flags & __GFP_THISNODE)
 		return NULL;
 
+	get_mems_allowed();
 	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
@@ -3209,7 +3281,7 @@ retry:
 		if (local_flags & __GFP_WAIT)
 			local_irq_enable();
 		kmem_flagcheck(cache, flags);
-		obj = kmem_getpages(cache, local_flags, numa_node_id());
+		obj = kmem_getpages(cache, local_flags, numa_mem_id());
 		if (local_flags & __GFP_WAIT)
 			local_irq_disable();
 		if (obj) {
@@ -3233,6 +3305,7 @@ retry:
 			}
 		}
 	}
+	put_mems_allowed();
 	return obj;
 }
 
@@ -3316,6 +3389,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 {
 	unsigned long save_flags;
 	void *ptr;
+	int slab_node = numa_mem_id();
 
 	flags &= gfp_allowed_mask;
 
@@ -3328,7 +3402,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	local_irq_save(save_flags);
 
 	if (nodeid == -1)
-		nodeid = numa_node_id();
+		nodeid = slab_node;
 
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Node not bootstrapped yet */
@@ -3336,7 +3410,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 		goto out;
 	}
 
-	if (nodeid == numa_node_id()) {
+	if (nodeid == slab_node) {
 		/*
 		 * Use the locally cached objects if possible.
 		 * However ____cache_alloc does not allow fallback
@@ -3380,8 +3454,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
 	 * We may just have run out of memory on the local node.
 	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
- 	if (!objp)
- 		objp = ____cache_alloc_node(cache, flags, numa_node_id());
+	if (!objp)
+		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
 
   out:
 	return objp;
@@ -3478,7 +3552,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
-	int node = numa_node_id();
+	int node = numa_mem_id();
 
 	batchcount = ac->batchcount;
 #if DEBUG
@@ -3912,7 +3986,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		return -ENOMEM;
 
 	for_each_online_cpu(i) {
-		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+		new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
 						batchcount, gfp);
 		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
@@ -3934,9 +4008,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		struct array_cache *ccold = new->new[i];
 		if (!ccold)
 			continue;
-		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
-		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
-		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
+		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
+		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
 		kfree(ccold);
 	}
 	kfree(new);
@@ -4042,7 +4116,7 @@ static void cache_reap(struct work_struct *w)
 {
 	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
-	int node = numa_node_id();
+	int node = numa_mem_id();
 	struct delayed_work *work = to_delayed_work(w);
 
 	if (!mutex_trylock(&cache_chain_mutex))
@@ -4216,10 +4290,11 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long node_frees = cachep->node_frees;
 		unsigned long overflows = cachep->node_overflow;
 
-		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
-				reaped, errors, max_freeable, node_allocs,
-				node_frees, overflows);
+		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+			   "%4lu %4lu %4lu %4lu %4lu",
+			   allocs, high, grown,
+			   reaped, errors, max_freeable, node_allocs,
+			   node_frees, overflows);
 	}
 	/* cpu stats */
 	{
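For reference, the memory hotplug notifier that the patch registers in kmem_cache_init_late() follows the generic notifier pattern sketched below. This is a minimal, self-contained module skeleton built only on the kernel APIs the patch itself relies on (hotplug_memory_notifier(), struct memory_notify, notifier_from_errno()); the callback name, the EXAMPLE_CALLBACK_PRI priority, and the empty case bodies are illustrative placeholders, not code from this patch.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/notifier.h>

/* Illustrative priority; the slab patch itself uses SLAB_CALLBACK_PRI. */
#define EXAMPLE_CALLBACK_PRI	1

static int example_memory_callback(struct notifier_block *self,
				   unsigned long action, void *arg)
{
	struct memory_notify *mnb = arg;
	int nid = mnb->status_change_nid;
	int ret = 0;

	/* Nothing to do unless the event brings a node on- or offline. */
	if (nid < 0)
		return NOTIFY_OK;

	switch (action) {
	case MEM_GOING_ONLINE:
		/* Allocate per-node state here; set ret = -ENOMEM on failure. */
		break;
	case MEM_GOING_OFFLINE:
		/* Drain per-node state here; set ret = -EBUSY to veto offline. */
		break;
	default:
		break;
	}
	/* A non-zero errno aborts the transition via notifier_from_errno(). */
	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
}

static int __init example_init(void)
{
	hotplug_memory_notifier(example_memory_callback, EXAMPLE_CALLBACK_PRI);
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");

Returning notifier_from_errno() from the MEM_GOING_ONLINE/MEM_GOING_OFFLINE cases is what lets a callback veto the transition, which is exactly how slab_memory_callback() above reports -ENOMEM from init_cache_nodelists_node() or -EBUSY from drain_cache_nodelists_node().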
