aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c260
1 files changed, 83 insertions, 177 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdac56009a38..c5b5f74cfd4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -98,17 +98,8 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}
-static const char *const mem_cgroup_lru_names[] = {
- "inactive_anon",
- "active_anon",
- "inactive_file",
- "active_file",
- "unevictable",
-};
-
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET 1024
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -484,7 +475,7 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- if (PageHead(page) && PageSlab(page))
+ if (PageSlab(page) && !PageTail(page))
memcg = memcg_from_slab_page(page);
else
memcg = READ_ONCE(page->mem_cgroup);
@@ -778,7 +769,7 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
if (!memcg || memcg == root_mem_cgroup) {
__mod_node_page_state(pgdat, idx, val);
} else {
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
rcu_read_unlock();
@@ -877,9 +868,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
- case MEM_CGROUP_TARGET_NUMAINFO:
- next = val + NUMAINFO_EVENTS_TARGET;
- break;
default:
break;
}
@@ -899,21 +887,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
- bool do_numainfo __maybe_unused;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_NUMAINFO);
-#endif
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
- if (unlikely(do_numainfo))
- atomic_inc(&memcg->numainfo_events);
-#endif
}
}
@@ -960,7 +939,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -1052,7 +1031,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_per_node *mz;
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
- iter = &mz->iter[reclaim->priority];
+ iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
@@ -1152,15 +1131,11 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
- int i;
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(from, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ iter = &mz->iter;
+ cmpxchg(&iter->position, dead_memcg, NULL);
}
}
@@ -1238,7 +1213,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
- lruvec = &pgdat->lruvec;
+ lruvec = &pgdat->__lruvec;
goto out;
}
@@ -1438,7 +1413,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+ seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
@@ -1451,8 +1426,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
/* Accumulated memory events */
- seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
- seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
+ memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
+ memcg_events(memcg, PGMAJFAULT));
seq_buf_printf(&s, "workingset_refault %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT));
@@ -1461,22 +1438,27 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
- seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
+ memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
- seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
- seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
- seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
- seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
+ memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
+ memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
+ memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
+ memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
memcg_events(memcg, THP_FAULT_ALLOC));
- seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -1595,104 +1577,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
return ret;
}
-#if MAX_NUMNODES > 1
-
-/**
- * test_mem_cgroup_node_reclaimable
- * @memcg: the target memcg
- * @nid: the node ID to be checked.
- * @noswap : specify true here if the user wants flle only information.
- *
- * This function returns whether the specified memcg contains any
- * reclaimable pages on a node. Returns true if there are any reclaimable
- * pages in the node.
- */
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
- int nid, bool noswap)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-
- if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
- lruvec_page_state(lruvec, NR_ACTIVE_FILE))
- return true;
- if (noswap || !total_swap_pages)
- return false;
- if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
- lruvec_page_state(lruvec, NR_ACTIVE_ANON))
- return true;
- return false;
-
-}
-
-/*
- * Always updating the nodemask is not very good - even if we have an empty
- * list or the wrong list here, we can start from some node and traverse all
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
- *
- */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
-{
- int nid;
- /*
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
- * pagein/pageout changes since the last update.
- */
- if (!atomic_read(&memcg->numainfo_events))
- return;
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
- return;
-
- /* make a nodemask where this memcg uses memory from */
- memcg->scan_nodes = node_states[N_MEMORY];
-
- for_each_node_mask(nid, node_states[N_MEMORY]) {
-
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
- node_clear(nid, memcg->scan_nodes);
- }
-
- atomic_set(&memcg->numainfo_events, 0);
- atomic_set(&memcg->numainfo_updating, 0);
-}
-
-/*
- * Selecting a node where we start reclaim from. Because what we need is just
- * reducing usage counter, start from anywhere is O,K. Considering
- * memory reclaim from current node, there are pros. and cons.
- *
- * Freeing memory from current node means freeing memory from a node which
- * we'll use or we've used. So, it may make LRU bad. And if several threads
- * hit limits, it will see a contention on a node. But freeing from remote
- * node means more costs for memory reclaim because of memory latency.
- *
- * Now, we use round-robin. Better algorithm is welcomed.
- */
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
- int node;
-
- mem_cgroup_may_update_nodemask(memcg);
- node = memcg->last_scanned_node;
-
- node = next_node_in(node, memcg->scan_nodes);
- /*
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
- * last time it really checked all the LRUs due to rate limiting.
- * Fallback to the current node in that case for simplicity.
- */
- if (unlikely(node == MAX_NUMNODES))
- node = numa_node_id();
-
- memcg->last_scanned_node = node;
- return node;
-}
-#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
- return 0;
-}
-#endif
-
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
pg_data_t *pgdat,
gfp_t gfp_mask,
@@ -1705,7 +1589,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
- .priority = 0,
};
excess = soft_limit_excess(root_memcg);
@@ -1800,7 +1683,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -2535,6 +2418,15 @@ retry:
}
/*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
@@ -3741,7 +3633,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
enum lru_list lru;
@@ -3849,13 +3741,6 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static const char *const memcg1_event_names[] = {
- "pgpgin",
- "pgpgout",
- "pgfault",
- "pgmajfault",
-};
-
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -3864,7 +3749,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned int i;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
@@ -3875,11 +3759,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
memcg_page_state_local(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
@@ -3904,11 +3788,12 @@ static int memcg_stat_show(struct seq_file *m, void *v)
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+ seq_printf(m, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
(u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
@@ -5014,12 +4899,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- /*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
- */
- memcg_flush_percpu_vmstats(memcg, false);
- memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
@@ -5030,6 +4909,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -5069,7 +4954,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
- memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
@@ -5420,6 +5304,8 @@ static int mem_cgroup_move_account(struct page *page,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -5443,11 +5329,15 @@ static int mem_cgroup_move_account(struct page *page,
anon = PageAnon(page);
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(from, pgdat);
+ to_vec = mem_cgroup_lruvec(to, pgdat);
+
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -5459,14 +5349,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -6081,7 +5971,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
+ unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long high;
int err;
@@ -6092,12 +5983,29 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
- nr_pages = page_counter_read(&memcg->memory);
- if (nr_pages > high)
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, true);
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long reclaimed;
+
+ if (nr_pages <= high)
+ break;
+
+ if (signal_pending(current))
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ break;
+ }
- memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -6129,10 +6037,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_pages <= max)
break;
- if (signal_pending(current)) {
- err = -EINTR;
+ if (signal_pending(current))
break;
- }
if (!drained) {
drain_all_stock(memcg);