Diffstat:
 mm/vmscan.c | 244 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 203 insertions(+), 41 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f86f288..7e7d25504651 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,12 +65,6 @@ struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
- /* This context's GFP mask */
- gfp_t gfp_mask;
-
- /* Allocation order */
- int order;
-
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
@@ -83,12 +77,6 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
- /* Scan (total_size >> priority) pages at once */
- int priority;
-
- /* The highest zone to isolate pages for reclaim from */
- enum zone_type reclaim_idx;
-
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
@@ -111,6 +99,18 @@ struct scan_control {
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
+ /* Allocation order */
+ s8 order;
+
+ /* Scan (total_size >> priority) pages at once */
+ s8 priority;
+
+ /* The highest zone to isolate pages for reclaim from */
+ s8 reclaim_idx;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
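
As an aside (not part of the patch): the point of narrowing order, priority, and reclaim_idx to s8 and grouping them before gfp_mask is that the three 1-byte fields pack into space that would otherwise be padding. A minimal userspace sketch with a stand-in subset of the fields, assuming an LP64 target:

	#include <stdio.h>
	#include <stdint.h>

	struct before {	/* four separate 4-byte fields */
		unsigned long nr_to_reclaim;
		uint32_t gfp_mask;	/* gfp_t */
		int order;
		int priority;
		int reclaim_idx;	/* enum zone_type */
	};

	struct after {	/* three 1-byte fields share one word with gfp_mask */
		unsigned long nr_to_reclaim;
		int8_t order;		/* s8 */
		int8_t priority;	/* s8 */
		int8_t reclaim_idx;	/* s8 */
		uint32_t gfp_mask;	/* gfp_t */
	};

	int main(void)
	{
		/* typically prints 24 vs. 16 on x86-64 */
		printf("%zu vs. %zu\n",
		       sizeof(struct before), sizeof(struct after));
		return 0;
	}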
@@ -169,6 +169,70 @@ unsigned long vm_total_pages;
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+#ifdef CONFIG_MEMCG_KMEM
+
+/*
+ * We allow subsystems to populate their shrinker-related
+ * LRU lists before register_shrinker_prepared() is called
+ * for the shrinker, since we don't want to impose
+ * restrictions on their internal registration order.
+ * In this case shrink_slab_memcg() may find the corresponding
+ * bit set in the shrinker map.
+ *
+ * This value lets shrink_slab_memcg() detect shrinkers that are
+ * still registering and skip do_shrink_slab() calls for them.
+ */
+#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+
+static DEFINE_IDR(shrinker_idr);
+static int shrinker_nr_max;
+
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id, ret = -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+ /* idr_alloc() can trigger reclaim and re-enter shrinkers, so shrink_slab() must use down_read_trylock() */
+ id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto unlock;
+
+ if (id >= shrinker_nr_max) {
+ if (memcg_expand_shrinker_maps(id)) {
+ idr_remove(&shrinker_idr, id);
+ goto unlock;
+ }
+
+ shrinker_nr_max = id + 1;
+ }
+ shrinker->id = id;
+ ret = 0;
+unlock:
+ up_write(&shrinker_rwsem);
+ return ret;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id = shrinker->id;
+
+ BUG_ON(id < 0);
+
+ down_write(&shrinker_rwsem);
+ idr_remove(&shrinker_idr, id);
+ up_write(&shrinker_rwsem);
+}
+#else /* CONFIG_MEMCG_KMEM */
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker)
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ if (prealloc_memcg_shrinker(shrinker))
+ goto free_deferred;
+ }
+
return 0;
+
+free_deferred:
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+ return -ENOMEM;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
+ if (!shrinker->nr_deferred)
+ return;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -326,6 +407,10 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
+#ifdef CONFIG_MEMCG_KMEM
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ idr_replace(&shrinker_idr, shrinker, shrinker->id);
+#endif
up_write(&shrinker_rwsem);
}
@@ -347,6 +432,8 @@ void unregister_shrinker(struct shrinker *shrinker)
{
if (!shrinker->nr_deferred)
return;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
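
Putting the pieces together, a memcg-aware cache would use this API roughly as below. This is a hypothetical sketch (the demo_* names and the list_lru-backed cache are illustrative, not from the patch):

	#include <linux/shrinker.h>
	#include <linux/list_lru.h>

	static struct list_lru demo_lru;	/* assumed initialized memcg-aware */

	static enum lru_status demo_isolate(struct list_head *item,
					    struct list_lru_one *lru,
					    spinlock_t *lock, void *arg)
	{
		/* ... free the object containing @item ... */
		return LRU_REMOVED;
	}

	static unsigned long demo_count(struct shrinker *s,
					struct shrink_control *sc)
	{
		return list_lru_shrink_count(&demo_lru, sc);
	}

	static unsigned long demo_scan(struct shrinker *s,
				       struct shrink_control *sc)
	{
		return list_lru_shrink_walk(&demo_lru, sc, demo_isolate, NULL);
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE,
	};

	static int __init demo_init(void)
	{
		int err = prealloc_shrinker(&demo_shrinker);	/* assigns ->id */

		if (err)
			return err;
		/*
		 * The cache may add objects (and set shrinker-map bits) here,
		 * before the shrinker becomes visible to shrink_slab_memcg();
		 * the SHRINKER_REGISTERING placeholder makes that window safe.
		 */
		register_shrinker_prepared(&demo_shrinker);
		return 0;
	}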
@@ -371,9 +458,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
: SHRINK_BATCH;
long scanned = 0, next_deferred;
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0)
- return 0;
+ if (freeable == 0 || freeable == SHRINK_EMPTY)
+ return freeable;
/*
* copy the current shrinker scan count into a local variable
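
For reference, SHRINK_EMPTY comes from an earlier patch in this series and sits next to SHRINK_STOP in include/linux/shrinker.h:

	#define SHRINK_STOP	(~0UL)
	#define SHRINK_EMPTY	(~0UL - 1)

It lets count_objects() distinguish an empty cache from one that merely has nothing reclaimable right now, which is what allows shrink_slab_memcg() below to clear the per-memcg bit.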
@@ -474,6 +564,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
+#ifdef CONFIG_MEMCG_KMEM
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ struct memcg_shrinker_map *map;
+ unsigned long freed = 0;
+ int ret, i;
+
+ if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ return 0;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
+ true);
+ if (unlikely(!map))
+ goto unlock;
+
+ for_each_set_bit(i, map->map, shrinker_nr_max) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+
+ shrinker = idr_find(&shrinker_idr, i);
+ if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (!shrinker)
+ clear_bit(i, map->map);
+ continue;
+ }
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(i, map->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure we have the bit set in this
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * memcg_set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ memcg_set_shrinker_bit(memcg, nid, i);
+ }
+ freed += ret;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
+ }
+unlock:
+ up_read(&shrinker_rwsem);
+ return freed;
+}
+#else /* CONFIG_MEMCG_KMEM */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
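
The other half of the barrier pairing described in the comment above lives in memcg_set_shrinker_bit(), added to mm/memcontrol.c earlier in this series; it looks approximately like this:

	void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid,
				    int shrinker_id)
	{
		if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
			struct memcg_shrinker_map *map;

			rcu_read_lock();
			map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
			/* Pairs with smp_mb__after_atomic() in shrink_slab_memcg() */
			smp_mb__before_atomic();
			set_bit(shrinker_id, map->map);
			rcu_read_unlock();
		}
	}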
/**
* shrink_slab - shrink slab caches
* @gfp_mask: allocation context
@@ -486,10 +654,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
- * @memcg specifies the memory cgroup to target. If it is not NULL,
- * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise, only unaware
- * shrinkers are called.
+ * @memcg specifies the memory cgroup to target. Memcg-unaware
+ * shrinkers are called only when it is the root cgroup.
*
* @priority is sc->priority, we take the number of objects and >> by priority
* in order to get the scan target.
@@ -502,9 +668,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ int ret;
- if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
- return 0;
+ if (!mem_cgroup_is_root(memcg))
+ return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
goto out;
@@ -516,19 +683,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- /*
- * If kernel memory accounting is disabled, we ignore
- * SHRINKER_MEMCG_AWARE flag and call all shrinkers
- * passing NULL for memcg.
- */
- if (memcg_kmem_enabled() &&
- !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- sc.nid = 0;
-
- freed += do_shrink_slab(&sc, shrinker, priority);
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ freed += ret;
/*
* Bail out if someone wants to register a new shrinker to
* prevent the registration from being stalled for long periods
@@ -554,6 +712,7 @@ void drop_slab_node(int nid)
struct mem_cgroup *memcg = NULL;
freed = 0;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
@@ -744,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
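
For reference, the helper the corrected comment now names is defined in include/linux/page_ref.h, essentially as follows (tracing hooks omitted):

	static inline int page_ref_freeze(struct page *page, int count)
	{
		return likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
	}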
@@ -2573,9 +2732,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- if (memcg)
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
+ memcg, sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -2599,10 +2757,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
- if (global_reclaim(sc))
- shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
- sc->priority);
-
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -3064,6 +3218,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
};
/*
+ * scan_control uses s8 fields for order, priority, and reclaim_idx.
+ * Confirm they are large enough for max values.
+ */
+ BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
+ BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
+
+ /*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
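
These BUILD_BUG_ON()s are free at run time: the macro resolves entirely at compile time. Conceptually it is equivalent to this simplified stand-in (the kernel's real version goes through compiletime_assert() for better error messages):

	/* Fails to compile when cond is nonzero: the array size goes negative */
	#define MY_BUILD_BUG_ON(cond)	((void)sizeof(char[1 - 2 * !!(cond)]))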