aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmpressure.c
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2016-01-14 15:21:32 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-14 16:00:49 -0800
commit8e8ae645249b85c8ed6c178557f8db8613a6bcc7 (patch)
treee1c347c9b18cad1a979dda026a1dff6f310d8977 /mm/vmpressure.c
parentmm: memcontrol: account socket memory in unified hierarchy memory controller (diff)
downloadlinux-dev-8e8ae645249b85c8ed6c178557f8db8613a6bcc7.tar.xz
linux-dev-8e8ae645249b85c8ed6c178557f8db8613a6bcc7.zip
mm: memcontrol: hook up vmpressure to socket pressure
Let the networking stack know when a memcg is under reclaim pressure so that it can clamp its transmit windows accordingly. Whenever the reclaim efficiency of a cgroup's LRU lists drops low enough for a MEDIUM or HIGH vmpressure event to occur, assert a pressure state in the socket and tcp memory code that tells it to curb consumption growth from sockets associated with said control group. Traditionally, vmpressure reports for the entire subtree of a memcg under pressure, which drops useful information on the individual groups reclaimed. However, it's too late to change the userinterface, so add a second reporting mode that reports on the level of reclaim instead of at the level of pressure, and use that report for sockets. vmpressure events are naturally edge triggered, so for hysteresis assert socket pressure for a second to allow for subsequent vmpressure events to occur before letting the socket code return to normal. This will likely need finetuning for a wider variety of workloads, but for now stick to the vmpressure presets and keep hysteresis simple. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: David S. Miller <davem@davemloft.net> Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/vmpressure.c')
-rw-r--r--mm/vmpressure.c78
1 files changed, 59 insertions, 19 deletions
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..506f03e4be47 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
};
static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
+ enum vmpressure_levels level)
{
struct vmpressure_event *ev;
- enum vmpressure_levels level;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
-
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
+ enum vmpressure_levels level;
spin_lock(&vmpr->sr_lock);
/*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- scanned = vmpr->scanned;
+ scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
return;
}
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
+ level = vmpressure_calc_level(scanned, reclaimed);
+
do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
+ if (vmpressure_event(vmpr, level))
break;
/*
* If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned)
return;
- spin_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- spin_unlock(&vmpr->sr_lock);
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ scanned = vmpr->scanned;
+ spin_unlock(&vmpr->sr_lock);
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (memcg == root_mem_cgroup)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ memcg->socket_pressure = jiffies + HZ;
+ }
+ }
}
/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, vmpressure_win, 0);
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
/**