aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c152
1 files changed, 83 insertions, 69 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a9c74b409681..32c661d66a45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
- long max_pass;
+ long freeable;
long nr;
long new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- max_pass = shrinker->count_objects(shrinker, shrinkctl);
- if (max_pass == 0)
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0)
return 0;
/*
@@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
+ delta *= freeable;
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
- total_scan = max_pass;
+ total_scan = freeable;
}
/*
@@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
* shrinkers to return -1 all the time. This results in a large
* nr being built up so when a shrink that can do some work
* comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
+ * freeable. This is bad for sustaining a working set in
* memory.
*
* Hence only allow the shrinker to scan the entire cache when
* a large delta change is calculated directly.
*/
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
+ if (delta < freeable / 4)
+ total_scan = min(total_scan, freeable / 2);
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
+ if (total_scan > freeable * 2)
+ total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
+ freeable, delta, total_scan);
/*
* Normally, we should not scan less than batch_size objects in one
@@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
*
* We detect the "tight on memory" situations by looking at the total
* number of objects we want to scan (total_scan). If it is greater
- * than the total number of objects on slab (max_pass), we must be
+ * than the total number of objects on slab (freeable), we must be
* scanning at high prio and therefore should try to reclaim as much as
* possible.
*/
while (total_scan >= batch_size ||
- total_scan >= max_pass) {
+ total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
@@ -523,7 +523,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
*/
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+ bool reclaimed)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -569,10 +570,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
swapcache_free(swap, page);
} else {
void (*freepage)(struct page *);
+ void *shadow = NULL;
freepage = mapping->a_ops->freepage;
-
- __delete_from_page_cache(page);
+ /*
+ * Remember a shadow entry for reclaimed file cache in
+ * order to detect refaults, thus thrashing, later on.
+ *
+ * But don't store shadows in an address space that is
+ * already exiting. This is not just an optizimation,
+ * inode reclaim needs to empty out the radix tree or
+ * the nodes are lost. Don't plant shadows behind its
+ * back.
+ */
+ if (reclaimed && page_is_file_cache(page) &&
+ !mapping_exiting(mapping))
+ shadow = workingset_eviction(mapping, page);
+ __delete_from_page_cache(page, shadow);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
@@ -595,7 +609,7 @@ cannot_free:
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
- if (__remove_mapping(mapping, page)) {
+ if (__remove_mapping(mapping, page, false)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
@@ -1065,7 +1079,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- if (!mapping || !__remove_mapping(mapping, page))
+ if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
/*
@@ -1144,7 +1158,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
TTU_UNMAP|TTU_IGNORE_ACCESS,
&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
list_splice(&clean_pages, page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1848,7 +1862,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file, free;
+ unsigned long anon, file;
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
@@ -1902,13 +1916,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
get_lru_size(lruvec, LRU_INACTIVE_FILE);
/*
- * If it's foreseeable that reclaiming the file cache won't be
- * enough to get the zone back into a desirable shape, we have
- * to swap. Better start now and leave the - probably heavily
- * thrashing - remaining file pages alone.
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
*/
if (global_reclaim(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
+ unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+
if (unlikely(file + free <= high_wmark_pages(zone))) {
scan_balance = SCAN_ANON;
goto out;
@@ -2297,16 +2315,26 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long lru_pages = 0;
bool aborted_reclaim = false;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ gfp_t orig_mask;
+ struct shrink_control shrink = {
+ .gfp_mask = sc->gfp_mask,
+ };
+ enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
+ orig_mask = sc->gfp_mask;
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
+ nodes_clear(shrink.nodes_to_scan);
+
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
@@ -2318,6 +2346,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
@@ -2331,7 +2363,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* noticeable problem, like transparent huge
* page allocations.
*/
- if (compaction_ready(zone, sc)) {
+ if ((zonelist_zone_idx(z) <= requested_highidx)
+ && compaction_ready(zone, sc)) {
aborted_reclaim = true;
continue;
}
@@ -2354,6 +2387,26 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
shrink_zone(zone, sc);
}
+ /*
+ * Don't shrink slabs when reclaiming memory from over limit cgroups
+ * but do shrink slab at least once when aborting reclaim for
+ * compaction to avoid unevenly scanning file/anon LRU pages over slab
+ * pages.
+ */
+ if (global_reclaim(sc)) {
+ shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+
+ /*
+ * Restore to original mask to avoid the impact on the caller if we
+ * promoted it to __GFP_HIGHMEM.
+ */
+ sc->gfp_mask = orig_mask;
+
return aborted_reclaim;
}
@@ -2394,13 +2447,9 @@ static bool all_unreclaimable(struct zonelist *zonelist,
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc,
- struct shrink_control *shrink)
+ struct scan_control *sc)
{
unsigned long total_scanned = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- struct zoneref *z;
- struct zone *zone;
unsigned long writeback_threshold;
bool aborted_reclaim;
@@ -2415,32 +2464,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);
- /*
- * Don't shrink slabs when reclaiming memory from over limit
- * cgroups but do shrink slab at least once when aborting
- * reclaim for compaction to avoid unevenly scanning file/anon
- * LRU pages over slab pages.
- */
- if (global_reclaim(sc)) {
- unsigned long lru_pages = 0;
-
- nodes_clear(shrink->nodes_to_scan);
- for_each_zone_zonelist(zone, z, zonelist,
- gfp_zone(sc->gfp_mask)) {
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- node_set(zone_to_nid(zone),
- shrink->nodes_to_scan);
- }
-
- shrink_slab(shrink, sc->nr_scanned, lru_pages);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
- }
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
@@ -2602,9 +2625,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.target_mem_cgroup = NULL,
.nodemask = nodemask,
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
/*
* Do not enter reclaim if fatal signal was delivered while throttled.
@@ -2618,7 +2638,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
@@ -2685,9 +2705,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
@@ -2702,7 +2719,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3337,9 +3354,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.order = 0,
.priority = DEF_PRIORITY,
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
@@ -3349,7 +3363,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();