From b0a8cc58e6b9aaae3045752059e5e6260c0b94bc Mon Sep 17 00:00:00 2001 From: Takamori Yamaguchi Date: Thu, 8 Nov 2012 15:53:39 -0800 Subject: mm: bugfix: set current->reclaim_state to NULL while returning from kswapd() In kswapd(), set current->reclaim_state to NULL before returning, as current->reclaim_state holds reference to variable on kswapd()'s stack. In rare cases, while returning from kswapd() during memory offlining, __free_slab() and freepages() can access the dangling pointer of current->reclaim_state. Signed-off-by: Takamori Yamaguchi Signed-off-by: Aaditya Kumar Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2624edcfb420..8b055e9379bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3017,6 +3017,8 @@ static int kswapd(void *p) &balanced_classzone_idx); } } + + current->reclaim_state = NULL; return 0; } -- cgit v1.2.3-59-g8ed1b From 96710098ee124951ff2fed7cd8406da92aad011a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Nov 2012 14:14:59 -0800 Subject: mm: revert "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" Jiri Slaby reported the following: (It's an effective revert of "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures".) Given kswapd had hours of runtime in ps/top output yesterday in the morning and after the revert it's now 2 minutes in sum for the last 24h, I would say, it's gone. The intention of the patch in question was to compensate for the loss of lumpy reclaim. Part of the reason lumpy reclaim worked is because it aggressively reclaimed pages and this patch was meant to be a sane compromise. When compaction fails, it gets deferred and both compaction and reclaim/compaction is deferred avoid excessive reclaim. However, since commit c654345924f7 ("mm: remove __GFP_NO_KSWAPD"), kswapd is woken up each time and continues reclaiming which was not taken into account when the patch was developed. Attempts to address the problem ended up just changing the shape of the problem instead of fixing it. The release window gets closer and while a THP allocation failing is not a major problem, kswapd chewing up a lot of CPU is. This patch reverts commit 83fde0f22872 ("mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures") and will be revisited in the future. Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Tested-by: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Jiri Slaby Cc: Johannes Hirte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b055e9379bc..48550c66f1f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1760,28 +1760,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) return false; } -#ifdef CONFIG_COMPACTION -/* - * If compaction is deferred for sc->order then scale the number of pages - * reclaimed based on the number of consecutive allocation failures - */ -static unsigned long scale_for_compaction(unsigned long pages_for_compaction, - struct lruvec *lruvec, struct scan_control *sc) -{ - struct zone *zone = lruvec_zone(lruvec); - - if (zone->compact_order_failed <= sc->order) - pages_for_compaction <<= zone->compact_defer_shift; - return pages_for_compaction; -} -#else -static unsigned long scale_for_compaction(unsigned long pages_for_compaction, - struct lruvec *lruvec, struct scan_control *sc) -{ - return pages_for_compaction; -} -#endif - /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns @@ -1829,9 +1807,6 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - - pages_for_compaction = scale_for_compaction(pages_for_compaction, - lruvec, sc); inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); -- cgit v1.2.3-59-g8ed1b From 50694c28f1e1dbea18272980d265742a5027fb63 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:48 -0800 Subject: mm: vmscan: check for fatal signals iff the process was throttled Commit 5515061d22f0 ("mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage") introduced a check for fatal signals after a process gets throttled for network storage. The intention was that if a process was throttled and got killed that it should not trigger the OOM killer. As pointed out by Minchan Kim and David Rientjes, this check is in the wrong place and too broad. If a system is in am OOM situation and a process is exiting, it can loop in __alloc_pages_slowpath() and calling direct reclaim in a loop. As the fatal signal is pending it returns 1 as if it is making forward progress and can effectively deadlock. This patch moves the fatal_signal_pending() check after throttling to throttle_direct_reclaim() where it belongs. If the process is killed while throttled, it will return immediately without direct reclaim except now it will have TIF_MEMDIE set and will use the PFMEMALLOC reserves. Minchan pointed out that it may be better to direct reclaim before returning to avoid using the reserves because there may be pages that can easily reclaim that would avoid using the reserves. However, we do no such targetted reclaim and there is no guarantee that suitable pages are available. As it is expected that this throttling happens when swap-over-NFS is used there is a possibility that the process will instead swap which may allocate network buffers from the PFMEMALLOC reserves. Hence, in the swap-over-nfs case where a process can be throtted and be killed it can use the reserves to exit or it can potentially use reserves to swap a few pages and then exit. This patch takes the option of using the reserves if necessary to allow the process exit quickly. If this patch passes review it should be considered a -stable candidate for 3.6. Signed-off-by: Mel Gorman Cc: David Rientjes Cc: Luigi Semenzato Cc: Dan Magenheimer Cc: KOSAKI Motohiro Cc: Sonny Rao Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 48550c66f1f2..cbf84e152f04 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2207,9 +2207,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes - * when the low watermark is reached + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. */ -static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zone *zone; @@ -2224,13 +2227,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) - return; + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. + * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; /* Check if the pfmemalloc reserves are ok */ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); pgdat = zone->zone_pgdat; if (pfmemalloc_watermark_ok(pgdat)) - return; + goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); @@ -2246,12 +2256,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, pfmemalloc_watermark_ok(pgdat), HZ); - return; + + goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, pfmemalloc_watermark_ok(pgdat)); + +check_pending: + if (fatal_signal_pending(current)) + return true; + +out: + return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, @@ -2273,13 +2291,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = sc.gfp_mask, }; - throttle_direct_reclaim(gfp_mask, zonelist, nodemask); - /* - * Do not enter reclaim if fatal signal is pending. 1 is returned so - * that the page allocator does not consider triggering OOM + * Do not enter reclaim if fatal signal was delivered while throttled. + * 1 is returned so that the page allocator does not OOM kill at this + * point. */ - if (fatal_signal_pending(current)) + if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, -- cgit v1.2.3-59-g8ed1b