aboutsummaryrefslogtreecommitdiffstats
path: root/mm/slub.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/slub.c')
-rw-r--r--mm/slub.c154
1 files changed, 93 insertions, 61 deletions
diff --git a/mm/slub.c b/mm/slub.c
index fe376fe1f4fe..06cdb1829dc9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2007,6 +2007,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
int pages;
int pobjects;
+ preempt_disable();
do {
pages = 0;
pobjects = 0;
@@ -2040,6 +2041,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
+ if (unlikely(!s->cpu_partial)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ }
+ preempt_enable();
#endif
}
@@ -2398,13 +2407,24 @@ redo:
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
- * Preemption is disabled for the retrieval of the tid because that
- * must occur from the current processor. We cannot allow rescheduling
- * on a different processor between the determination of the pointer
- * and the retrieval of the tid.
+ * We should guarantee that tid and kmem_cache are retrieved on
+ * the same cpu. It could be different if CONFIG_PREEMPT so we need
+ * to check if it is matched or not.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
+
+ /*
+ * Irqless object alloc/free algorithm used here depends on sequence
+ * of fetching cpu_slab's data. tid should be fetched before anything
+ * on c to guarantee that object and page associated with previous tid
+ * won't be used with current tid. If we fetch tid first, object and
+ * page could be one associated with next tid and our alloc/free
+ * request will be failed. In this case, we will retry. So, no problem.
+ */
+ barrier();
/*
* The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2432,6 @@ redo:
* occurs on the right processor and that there was no operation on the
* linked list in between.
*/
- tid = c->tid;
- preempt_enable();
object = c->freelist;
page = c->page;
@@ -2512,7 +2530,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
/*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2677,13 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succedd.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
- tid = c->tid;
- preempt_enable();
+ /* Same with comment on barrier() in slab_alloc_node() */
+ barrier();
if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
@@ -3347,69 +3367,92 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+#define SHRINK_PROMOTE_MAX 32
+
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
*
* The slabs with the least items are placed last. This results in them
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
int node;
int i;
struct kmem_cache_node *n;
struct page *page;
struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ struct list_head discard;
+ struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
+ int ret = 0;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ if (deactivate) {
+ /*
+ * Disable empty slabs caching. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial),
+ * so we have to make sure the change is visible.
+ */
+ kick_all_cpus_sync();
+ }
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
- if (!n->nr_partial)
- continue;
-
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ INIT_LIST_HEAD(&discard);
+ for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+ INIT_LIST_HEAD(promote + i);
spin_lock_irqsave(&n->list_lock, flags);
/*
- * Build lists indexed by the items in use in each slab.
+ * Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- list_move(&page->lru, slabs_by_inuse + page->inuse);
- if (!page->inuse)
+ int free = page->objects - page->inuse;
+
+ /* Do not reread page->inuse */
+ barrier();
+
+ /* We do not keep full slabs on the list */
+ BUG_ON(free <= 0);
+
+ if (free == page->objects) {
+ list_move(&page->lru, &discard);
n->nr_partial--;
+ } else if (free <= SHRINK_PROMOTE_MAX)
+ list_move(&page->lru, promote + free - 1);
}
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * Promote the slabs filled up most to the head of the
+ * partial list.
*/
- for (i = objects - 1; i > 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+ list_splice(promote + i, &n->partial);
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ list_for_each_entry_safe(page, t, &discard, lru)
discard_slab(s, page);
+
+ if (slabs_node(s, node))
+ ret = 1;
}
- kfree(slabs_by_inuse);
- return 0;
+ return ret;
}
static int slab_mem_going_offline_callback(void *arg)
@@ -3418,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ __kmem_cache_shrink(s, false);
mutex_unlock(&slab_mutex);
return 0;
@@ -3566,6 +3609,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
+ slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
return s;
}
@@ -3624,13 +3668,10 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s, *c;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int i;
- struct kmem_cache *c;
-
s->refcount++;
/*
@@ -3640,10 +3681,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
+ for_each_memcg_cache(c, s) {
c->object_size = s->object_size;
c->inuse = max_t(int, c->inuse,
ALIGN(size, sizeof(void *)));
@@ -4680,12 +4718,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1') {
- int rc = kmem_cache_shrink(s);
-
- if (rc)
- return rc;
- } else
+ if (buf[0] == '1')
+ kmem_cache_shrink(s);
+ else
return -EINVAL;
return length;
}
@@ -4909,7 +4944,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- int i;
+ struct kmem_cache *c;
mutex_lock(&slab_mutex);
if (s->max_attr_size < len)
@@ -4932,11 +4967,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* directly either failed or succeeded, in which case we loop
* through the descendants with best-effort propagation.
*/
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
- if (c)
- attribute->store(c, buf, len);
- }
+ for_each_memcg_cache(c, s)
+ attribute->store(c, buf, len);
mutex_unlock(&slab_mutex);
}
#endif
@@ -4953,7 +4985,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (is_root_cache(s))
return;
- root_cache = s->memcg_params->root_cache;
+ root_cache = s->memcg_params.root_cache;
/*
* This mean this cache had no attribute written. Therefore, no point
@@ -5033,7 +5065,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
- return s->memcg_params->root_cache->memcg_kset;
+ return s->memcg_params.root_cache->memcg_kset;
#endif
return slab_kset;
}