aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c30
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/highmem.c4
-rw-r--r--mm/memcontrol.c210
-rw-r--r--mm/mempolicy.c9
-rw-r--r--mm/migrate.c8
-rw-r--r--mm/page-writeback.c15
-rw-r--r--mm/page_cgroup.c3
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slub.c495
-rw-r--r--mm/vmalloc.c15
-rw-r--r--mm/vmscan.c71
-rw-r--r--mm/vmstat.c4
13 files changed, 472 insertions, 402 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d6edf8d14f9c..a87da524a4a0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,6 +359,17 @@ static unsigned long bdi_longest_inactive(void)
return max(5UL * 60 * HZ, interval);
}
+/*
+ * Clear pending bit and wakeup anybody waiting for flusher thread creation or
+ * shutdown
+ */
+static void bdi_clear_pending(struct backing_dev_info *bdi)
+{
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
+}
+
static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
@@ -390,6 +401,13 @@ static int bdi_forker_thread(void *ptr)
}
spin_lock_bh(&bdi_lock);
+ /*
+ * In the following loop we are going to check whether we have
+ * some work to do without any synchronization with tasks
+ * waking us up to do work for them. So we have to set task
+ * state already here so that we don't miss wakeups coming
+ * after we verify some condition.
+ */
set_current_state(TASK_INTERRUPTIBLE);
list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -469,11 +487,13 @@ static int bdi_forker_thread(void *ptr)
spin_unlock_bh(&bdi->wb_lock);
wake_up_process(task);
}
+ bdi_clear_pending(bdi);
break;
case KILL_THREAD:
__set_current_state(TASK_RUNNING);
kthread_stop(task);
+ bdi_clear_pending(bdi);
break;
case NO_ACTION:
@@ -489,16 +509,8 @@ static int bdi_forker_thread(void *ptr)
else
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
try_to_freeze();
- /* Back to the main loop */
- continue;
+ break;
}
-
- /*
- * Clear pending bit and wakeup anybody waiting to tear us down.
- */
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
}
return 0;
diff --git a/mm/filemap.c b/mm/filemap.c
index 645a080ba4df..7771871fa353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -827,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
{
unsigned int i;
unsigned int ret;
- unsigned int nr_found;
+ unsigned int nr_found, nr_skip;
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
(void ***)pages, NULL, start, nr_pages);
ret = 0;
+ nr_skip = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
repeat:
@@ -856,6 +857,7 @@ repeat:
* here as an exceptional entry: so skip over it -
* we only reach this from invalidate_mapping_pages().
*/
+ nr_skip++;
continue;
}
@@ -876,7 +878,7 @@ repeat:
* If all entries were removed before we could secure them,
* try again, because callers stop trying once 0 is returned.
*/
- if (unlikely(!ret && nr_found))
+ if (unlikely(!ret && nr_found > nr_skip))
goto restart;
rcu_read_unlock();
return ret;
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..5ef672c07f75 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -326,7 +326,7 @@ static struct page_address_slot {
spinlock_t lock; /* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
{
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
*
* Returns the page's virtual address.
*/
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
{
unsigned long flags;
void *ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f4ec4e7ca4cd..3508777837c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list {
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
-enum {
- SCAN_BY_LIMIT,
- SCAN_BY_SYSTEM,
- NR_SCAN_CONTEXT,
- SCAN_BY_SHRINK, /* not recorded now */
-};
-
-enum {
- SCAN,
- SCAN_ANON,
- SCAN_FILE,
- ROTATE,
- ROTATE_ANON,
- ROTATE_FILE,
- FREED,
- FREED_ANON,
- FREED_FILE,
- ELAPSED,
- NR_SCANSTATS,
-};
-
-struct scanstat {
- spinlock_t lock;
- unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
- unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
-};
-
-const char *scanstat_string[NR_SCANSTATS] = {
- "scanned_pages",
- "scanned_anon_pages",
- "scanned_file_pages",
- "rotated_pages",
- "rotated_anon_pages",
- "rotated_file_pages",
- "freed_pages",
- "freed_anon_pages",
- "freed_file_pages",
- "elapsed_ns",
-};
-#define SCANSTAT_WORD_LIMIT "_by_limit"
-#define SCANSTAT_WORD_SYSTEM "_by_system"
-#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
-
-
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -313,8 +269,7 @@ struct mem_cgroup {
/* For oom notifier event fd */
struct list_head oom_notify;
- /* For recording LRU-scan statistics */
- struct scanstat scanstat;
+
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
@@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
}
#endif
-static void __mem_cgroup_record_scanstat(unsigned long *stats,
- struct memcg_scanrecord *rec)
-{
-
- stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
- stats[SCAN_ANON] += rec->nr_scanned[0];
- stats[SCAN_FILE] += rec->nr_scanned[1];
-
- stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
- stats[ROTATE_ANON] += rec->nr_rotated[0];
- stats[ROTATE_FILE] += rec->nr_rotated[1];
-
- stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
- stats[FREED_ANON] += rec->nr_freed[0];
- stats[FREED_FILE] += rec->nr_freed[1];
-
- stats[ELAPSED] += rec->elapsed;
-}
-
-static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
-{
- struct mem_cgroup *mem;
- int context = rec->context;
-
- if (context >= NR_SCAN_CONTEXT)
- return;
-
- mem = rec->mem;
- spin_lock(&mem->scanstat.lock);
- __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
- spin_unlock(&mem->scanstat.lock);
-
- mem = rec->root;
- spin_lock(&mem->scanstat.lock);
- __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
- spin_unlock(&mem->scanstat.lock);
-}
-
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
- struct memcg_scanrecord rec;
unsigned long excess;
- unsigned long scanned;
+ unsigned long nr_scanned;
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
@@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
if (!check_soft && !shrink && root_mem->memsw_is_minimum)
noswap = true;
- if (shrink)
- rec.context = SCAN_BY_SHRINK;
- else if (check_soft)
- rec.context = SCAN_BY_SYSTEM;
- else
- rec.context = SCAN_BY_LIMIT;
-
- rec.root = root_mem;
-
while (1) {
victim = mem_cgroup_select_victim(root_mem);
if (victim == root_mem) {
@@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
css_put(&victim->css);
continue;
}
- rec.mem = victim;
- rec.nr_scanned[0] = 0;
- rec.nr_scanned[1] = 0;
- rec.nr_rotated[0] = 0;
- rec.nr_rotated[1] = 0;
- rec.nr_freed[0] = 0;
- rec.nr_freed[1] = 0;
- rec.elapsed = 0;
/* we use swappiness of local cgroup */
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, zone, &rec, &scanned);
- *total_scanned += scanned;
+ noswap, zone, &nr_scanned);
+ *total_scanned += nr_scanned;
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
- noswap, &rec);
- mem_cgroup_record_scanstat(&rec);
+ noswap);
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
@@ -1841,29 +1739,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
- int lock_count = -1;
struct mem_cgroup *iter, *failed = NULL;
bool cond = true;
for_each_mem_cgroup_tree_cond(iter, mem, cond) {
- bool locked = iter->oom_lock;
-
- iter->oom_lock = true;
- if (lock_count == -1)
- lock_count = iter->oom_lock;
- else if (lock_count != locked) {
+ if (iter->oom_lock) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
- lock_count = 0;
failed = iter;
cond = false;
- }
+ } else
+ iter->oom_lock = true;
}
if (!failed)
- goto done;
+ return true;
/*
* OK, we failed to lock the whole subtree so we have to clean up
@@ -1877,8 +1769,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
}
iter->oom_lock = false;
}
-done:
- return lock_count;
+ return false;
}
/*
@@ -2091,6 +1982,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE (0)
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_MUTEX(percpu_charge_mutex);
/*
* Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -2168,13 +2060,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
- /*
- * Get a hint for avoiding draining charges on the current cpu,
- * which must be exhausted by our charging. It is not required that
- * this be a precise check, so we use raw_smp_processor_id() instead of
- * getcpu()/putcpu().
- */
- curcpu = raw_smp_processor_id();
+ curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *mem;
@@ -2191,14 +2077,14 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
schedule_work_on(cpu, &stock->work);
}
}
+ put_cpu();
if (!sync)
goto out;
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
- test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+ if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
flush_work(&stock->work);
}
out:
@@ -2213,14 +2099,22 @@ out:
*/
static void drain_all_stock_async(struct mem_cgroup *root_mem)
{
+ /*
+ * If someone calls draining, avoid adding more kworker runs.
+ */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
drain_all_stock(root_mem, false);
+ mutex_unlock(&percpu_charge_mutex);
}
/* This is a synchronous drain interface. */
static void drain_all_stock_sync(struct mem_cgroup *root_mem)
{
/* called when force_empty is called */
+ mutex_lock(&percpu_charge_mutex);
drain_all_stock(root_mem, true);
+ mutex_unlock(&percpu_charge_mutex);
}
/*
@@ -3858,18 +3752,14 @@ try_to_free:
/* try to free all pages in this cgroup */
shrink = 1;
while (nr_retries && mem->res.usage > 0) {
- struct memcg_scanrecord rec;
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
- rec.context = SCAN_BY_SHRINK;
- rec.mem = mem;
- rec.root = mem;
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
- false, &rec);
+ false);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
@@ -4713,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
}
#endif /* CONFIG_NUMA */
-static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
- struct cftype *cft,
- struct cgroup_map_cb *cb)
-{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
- char string[64];
- int i;
-
- for (i = 0; i < NR_SCANSTATS; i++) {
- strcpy(string, scanstat_string[i]);
- strcat(string, SCANSTAT_WORD_LIMIT);
- cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
- }
-
- for (i = 0; i < NR_SCANSTATS; i++) {
- strcpy(string, scanstat_string[i]);
- strcat(string, SCANSTAT_WORD_SYSTEM);
- cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
- }
-
- for (i = 0; i < NR_SCANSTATS; i++) {
- strcpy(string, scanstat_string[i]);
- strcat(string, SCANSTAT_WORD_LIMIT);
- strcat(string, SCANSTAT_WORD_HIERARCHY);
- cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
- }
- for (i = 0; i < NR_SCANSTATS; i++) {
- strcpy(string, scanstat_string[i]);
- strcat(string, SCANSTAT_WORD_SYSTEM);
- strcat(string, SCANSTAT_WORD_HIERARCHY);
- cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
- }
- return 0;
-}
-
-static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
- unsigned int event)
-{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
-
- spin_lock(&mem->scanstat.lock);
- memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
- memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
- spin_unlock(&mem->scanstat.lock);
- return 0;
-}
-
-
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -4831,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = {
.mode = S_IRUGO,
},
#endif
- {
- .name = "vmscan_stat",
- .read_map = mem_cgroup_vmscan_stat_read,
- .trigger = mem_cgroup_reset_vmscan_stat,
- },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -5099,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
- spin_lock_init(&mem->scanstat.lock);
return &mem->css;
free_out:
__mem_cgroup_free(mem);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8b57173c1dd5..9c51f9f58cac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
- pgoff_t pgoff;
unsigned long vmstart;
unsigned long vmend;
@@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, new_pol);
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ new_pol);
if (prev) {
vma = prev;
next = vma->vm_next;
@@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
if (!err && nmask) {
- err = copy_from_user(bm, nm, alloc_size);
+ unsigned long copy_size;
+ copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
+ err = copy_from_user(bm, nm, copy_size);
/* ensure entire bitmap is zeroed */
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
err |= compat_put_bitmap(nmask, bm, nr_bits);
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e677414..14d0a6a632f6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ /*
+ * Peek to check is_swap_pte() before taking ptlock? No, we
+ * can race mremap's move_ptes(), which skips anon_vma lock.
+ */
ptl = pte_lockptr(mm, pmd);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d1960744f881..0e309cd1b5b9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -754,21 +754,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* 200ms is typically more than enough to curb heavy dirtiers;
* (b) the pause time limit makes the dirtiers more responsive.
*/
- if (nr_dirty < dirty_thresh +
- dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+ if (nr_dirty < dirty_thresh &&
+ bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
time_after(jiffies, start_time + MAX_PAUSE))
break;
- /*
- * pass-good area. When some bdi gets blocked (eg. NFS server
- * not responding), or write bandwidth dropped dramatically due
- * to concurrent reads, or dirty threshold suddenly dropped and
- * the dirty pages cannot be brought down anytime soon (eg. on
- * slow USB stick), at least let go of the good bdi's.
- */
- if (nr_dirty < dirty_thresh +
- dirty_thresh / DIRTY_PASSGOOD_AREA &&
- bdi_dirty < bdi_thresh)
- break;
/*
* Increase the delay for each loop, up to our previous
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 39d216d535ea..6bdc67dbbc28 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -513,11 +513,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
- array = vmalloc(array_size);
+ array = vzalloc(array_size);
if (!array)
goto nomem;
- memset(array, 0, array_size);
ctrl = &swap_cgroup_ctrl[type];
mutex_lock(&swap_cgroup_mutex);
ctrl->length = length;
diff --git a/mm/shmem.c b/mm/shmem.c
index 32f6763f16fb..2d3577295298 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1458,7 +1458,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir,
- &dentry->d_name, NULL,
+ &dentry->d_name,
NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
@@ -1598,7 +1598,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (!inode)
return -ENOSPC;
- error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
diff --git a/mm/slub.c b/mm/slub.c
index 943f4906131b..95215aa6a75e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1421,7 +1421,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, last, NULL);
page->freelist = start;
- page->inuse = 0;
+ page->inuse = page->objects;
page->frozen = 1;
out:
return page;
@@ -1528,10 +1528,13 @@ static inline void remove_partial(struct kmem_cache_node *n,
* Lock slab, remove from the partial list and put the object into the
* per cpu freelist.
*
+ * Returns a list of objects or NULL if it fails.
+ *
* Must hold list_lock.
*/
-static inline int acquire_slab(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page)
+static inline void *acquire_slab(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page,
+ int mode)
{
void *freelist;
unsigned long counters;
@@ -1546,7 +1549,8 @@ static inline int acquire_slab(struct kmem_cache *s,
freelist = page->freelist;
counters = page->counters;
new.counters = counters;
- new.inuse = page->objects;
+ if (mode)
+ new.inuse = page->objects;
VM_BUG_ON(new.frozen);
new.frozen = 1;
@@ -1557,32 +1561,19 @@ static inline int acquire_slab(struct kmem_cache *s,
"lock and freeze"));
remove_partial(n, page);
-
- if (freelist) {
- /* Populate the per cpu freelist */
- this_cpu_write(s->cpu_slab->freelist, freelist);
- this_cpu_write(s->cpu_slab->page, page);
- this_cpu_write(s->cpu_slab->node, page_to_nid(page));
- return 1;
- } else {
- /*
- * Slab page came from the wrong list. No object to allocate
- * from. Put it onto the correct list and continue partial
- * scan.
- */
- printk(KERN_ERR "SLUB: %s : Page without available objects on"
- " partial list\n", s->name);
- return 0;
- }
+ return freelist;
}
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+
/*
* Try to allocate a partial slab from a specific node.
*/
-static struct page *get_partial_node(struct kmem_cache *s,
- struct kmem_cache_node *n)
+static void *get_partial_node(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct kmem_cache_cpu *c)
{
- struct page *page;
+ struct page *page, *page2;
+ void *object = NULL;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -1594,26 +1585,43 @@ static struct page *get_partial_node(struct kmem_cache *s,
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry(page, &n->partial, lru)
- if (acquire_slab(s, n, page))
- goto out;
- page = NULL;
-out:
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ void *t = acquire_slab(s, n, page, object == NULL);
+ int available;
+
+ if (!t)
+ break;
+
+ if (!object) {
+ c->page = page;
+ c->node = page_to_nid(page);
+ stat(s, ALLOC_FROM_PARTIAL);
+ object = t;
+ available = page->objects - page->inuse;
+ } else {
+ page->freelist = t;
+ available = put_cpu_partial(s, page, 0);
+ }
+ if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
+ break;
+
+ }
spin_unlock(&n->list_lock);
- return page;
+ return object;
}
/*
* Get a page from somewhere. Search in increasing NUMA distances.
*/
-static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
+static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ struct kmem_cache_cpu *c)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
- struct page *page;
+ void *object;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1646,10 +1654,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
- page = get_partial_node(s, n);
- if (page) {
+ object = get_partial_node(s, n, c);
+ if (object) {
put_mems_allowed();
- return page;
+ return object;
}
}
}
@@ -1661,16 +1669,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
/*
* Get a partial page, lock it and return it.
*/
-static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
+static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
+ struct kmem_cache_cpu *c)
{
- struct page *page;
+ void *object;
int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
- page = get_partial_node(s, get_node(s, searchnode));
- if (page || node != NUMA_NO_NODE)
- return page;
+ object = get_partial_node(s, get_node(s, searchnode), c);
+ if (object || node != NUMA_NO_NODE)
+ return object;
- return get_any_partial(s, flags);
+ return get_any_partial(s, flags, c);
}
#ifdef CONFIG_PREEMPT
@@ -1739,9 +1748,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
for_each_possible_cpu(cpu)
per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}
-/*
- * Remove the cpu slab
- */
/*
* Remove the cpu slab
@@ -1894,6 +1900,123 @@ redo:
}
}
+/* Unfreeze all the cpu partial slabs */
+static void unfreeze_partials(struct kmem_cache *s)
+{
+ struct kmem_cache_node *n = NULL;
+ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+ struct page *page;
+
+ while ((page = c->partial)) {
+ enum slab_modes { M_PARTIAL, M_FREE };
+ enum slab_modes l, m;
+ struct page new;
+ struct page old;
+
+ c->partial = page->next;
+ l = M_FREE;
+
+ do {
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ new.counters = old.counters;
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ if (!new.inuse && (!n || n->nr_partial > s->min_partial))
+ m = M_FREE;
+ else {
+ struct kmem_cache_node *n2 = get_node(s,
+ page_to_nid(page));
+
+ m = M_PARTIAL;
+ if (n != n2) {
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ n = n2;
+ spin_lock(&n->list_lock);
+ }
+ }
+
+ if (l != m) {
+ if (l == M_PARTIAL)
+ remove_partial(n, page);
+ else
+ add_partial(n, page, 1);
+
+ l = m;
+ }
+
+ } while (!cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"));
+
+ if (m == M_FREE) {
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+ }
+
+ if (n)
+ spin_unlock(&n->list_lock);
+}
+
+/*
+ * Put a page that was just frozen (in __slab_free) into a partial page
+ * slot if available. This is done without interrupts disabled and without
+ * preemption disabled. The cmpxchg is racy and may put the partial page
+ * onto a random cpus partial slot.
+ *
+ * If we did not find a slot then simply move all the partials to the
+ * per node partial list.
+ */
+int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+{
+ struct page *oldpage;
+ int pages;
+ int pobjects;
+
+ do {
+ pages = 0;
+ pobjects = 0;
+ oldpage = this_cpu_read(s->cpu_slab->partial);
+
+ if (oldpage) {
+ pobjects = oldpage->pobjects;
+ pages = oldpage->pages;
+ if (drain && pobjects > s->cpu_partial) {
+ unsigned long flags;
+ /*
+ * partial array is full. Move the existing
+ * set to the per node partial list.
+ */
+ local_irq_save(flags);
+ unfreeze_partials(s);
+ local_irq_restore(flags);
+ pobjects = 0;
+ pages = 0;
+ }
+ }
+
+ pages++;
+ pobjects += page->objects - page->inuse;
+
+ page->pages = pages;
+ page->pobjects = pobjects;
+ page->next = oldpage;
+
+ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+ stat(s, CPU_PARTIAL_FREE);
+ return pobjects;
+}
+
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
@@ -1909,8 +2032,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- if (likely(c && c->page))
- flush_slab(s, c);
+ if (likely(c)) {
+ if (c->page)
+ flush_slab(s, c);
+
+ unfreeze_partials(s);
+ }
}
static void flush_cpu_slab(void *d)
@@ -2001,12 +2128,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
}
}
+static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
+ int node, struct kmem_cache_cpu **pc)
+{
+ void *object;
+ struct kmem_cache_cpu *c;
+ struct page *page = new_slab(s, flags, node);
+
+ if (page) {
+ c = __this_cpu_ptr(s->cpu_slab);
+ if (c->page)
+ flush_slab(s, c);
+
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ object = page->freelist;
+ page->freelist = NULL;
+
+ stat(s, ALLOC_SLAB);
+ c->node = page_to_nid(page);
+ c->page = page;
+ *pc = c;
+ } else
+ object = NULL;
+
+ return object;
+}
+
/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
*
- * Interrupts are disabled.
- *
* Processing is still very fast if new objects have been freed to the
* regular freelist. In that case we simply take over the regular freelist
* as the lockless freelist and zap the regular freelist.
@@ -2023,7 +2177,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
- struct page *page;
unsigned long flags;
struct page new;
unsigned long counters;
@@ -2038,13 +2191,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = this_cpu_ptr(s->cpu_slab);
#endif
- /* We handle __GFP_ZERO in the caller */
- gfpflags &= ~__GFP_ZERO;
-
- page = c->page;
- if (!page)
+ if (!c->page)
goto new_slab;
-
+redo:
if (unlikely(!node_match(c, node))) {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, c);
@@ -2054,8 +2203,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
stat(s, ALLOC_SLOWPATH);
do {
- object = page->freelist;
- counters = page->counters;
+ object = c->page->freelist;
+ counters = c->page->counters;
new.counters = counters;
VM_BUG_ON(!new.frozen);
@@ -2067,17 +2216,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
*
* If there are objects left then we retrieve them
* and use them to refill the per cpu queue.
- */
+ */
- new.inuse = page->objects;
+ new.inuse = c->page->objects;
new.frozen = object != NULL;
- } while (!__cmpxchg_double_slab(s, page,
+ } while (!__cmpxchg_double_slab(s, c->page,
object, counters,
NULL, new.counters,
"__slab_alloc"));
- if (unlikely(!object)) {
+ if (!object) {
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
@@ -2086,58 +2235,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
stat(s, ALLOC_REFILL);
load_freelist:
- VM_BUG_ON(!page->frozen);
c->freelist = get_freepointer(s, object);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
return object;
new_slab:
- page = get_partial(s, gfpflags, node);
- if (page) {
- stat(s, ALLOC_FROM_PARTIAL);
- object = c->freelist;
- if (kmem_cache_debug(s))
- goto debug;
- goto load_freelist;
+ if (c->partial) {
+ c->page = c->partial;
+ c->partial = c->page->next;
+ c->node = page_to_nid(c->page);
+ stat(s, CPU_PARTIAL_ALLOC);
+ c->freelist = NULL;
+ goto redo;
}
- page = new_slab(s, gfpflags, node);
+ /* Then do expensive stuff like retrieving pages from the partial lists */
+ object = get_partial(s, gfpflags, node, c);
- if (page) {
- c = __this_cpu_ptr(s->cpu_slab);
- if (c->page)
- flush_slab(s, c);
+ if (unlikely(!object)) {
- /*
- * No other reference to the page yet so we can
- * muck around with it freely without cmpxchg
- */
- object = page->freelist;
- page->freelist = NULL;
- page->inuse = page->objects;
+ object = new_slab_objects(s, gfpflags, node, &c);
- stat(s, ALLOC_SLAB);
- c->node = page_to_nid(page);
- c->page = page;
+ if (unlikely(!object)) {
+ if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(s, gfpflags, node);
- if (kmem_cache_debug(s))
- goto debug;
- goto load_freelist;
+ local_irq_restore(flags);
+ return NULL;
+ }
}
- if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
- slab_out_of_memory(s, gfpflags, node);
- local_irq_restore(flags);
- return NULL;
-debug:
- if (!object || !alloc_debug_processing(s, page, object, addr))
- goto new_slab;
+ if (likely(!kmem_cache_debug(s)))
+ goto load_freelist;
+
+ /* Only entered in the debug case */
+ if (!alloc_debug_processing(s, c->page, object, addr))
+ goto new_slab; /* Slab failed checks. Next slab needed */
c->freelist = get_freepointer(s, object);
deactivate_slab(s, c);
- c->page = NULL;
c->node = NUMA_NO_NODE;
local_irq_restore(flags);
return object;
@@ -2307,16 +2445,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
was_frozen = new.frozen;
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen && !n) {
- n = get_node(s, page_to_nid(page));
- /*
- * Speculatively acquire the list_lock.
- * If the cmpxchg does not succeed then we may
- * drop the list_lock without any processing.
- *
- * Otherwise the list_lock will synchronize with
- * other processors updating the list of slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
+
+ if (!kmem_cache_debug(s) && !prior)
+
+ /*
+ * Slab was on no list before and will be partially empty
+ * We can defer the list move and instead freeze it.
+ */
+ new.frozen = 1;
+
+ else { /* Needs to be taken off a list */
+
+ n = get_node(s, page_to_nid(page));
+ /*
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ }
}
inuse = new.inuse;
@@ -2326,7 +2477,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
"__slab_free"));
if (likely(!n)) {
- /*
+
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
+ if (new.frozen && !was_frozen)
+ put_cpu_partial(s, page, 1);
+
+ /*
* The list lock was not taken therefore no list
* activity can be necessary.
*/
@@ -2395,7 +2554,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
slab_free_hook(s, x);
redo:
-
/*
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
@@ -2659,7 +2817,7 @@ static void early_kmem_cache_node_alloc(int node)
n = page->freelist;
BUG_ON(!n);
page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse++;
+ page->inuse = 1;
page->frozen = 0;
kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
@@ -2885,7 +3043,34 @@ static int kmem_cache_open(struct kmem_cache *s,
* The larger the object size is, the more pages we want on the partial
* list to avoid pounding the page allocator excessively.
*/
- set_min_partial(s, ilog2(s->size));
+ set_min_partial(s, ilog2(s->size) / 2);
+
+ /*
+ * cpu_partial determined the maximum number of objects kept in the
+ * per cpu partial lists of a processor.
+ *
+ * Per cpu partial lists mainly contain slabs that just have one
+ * object freed. If they are used for allocation then they can be
+ * filled up again with minimal effort. The slab will never hit the
+ * per node partial lists and therefore no locking will be required.
+ *
+ * This setting also determines
+ *
+ * A) The number of objects from per cpu partial slabs dumped to the
+ * per node list when we reach the limit.
+ * B) The number of objects in cpu partial slabs to extract from the
+ * per node list when we run out of per cpu objects. We only fetch 50%
+ * to keep some capacity around for frees.
+ */
+ if (s->size >= PAGE_SIZE)
+ s->cpu_partial = 2;
+ else if (s->size >= 1024)
+ s->cpu_partial = 6;
+ else if (s->size >= 256)
+ s->cpu_partial = 13;
+ else
+ s->cpu_partial = 30;
+
s->refcount = 1;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
@@ -2944,13 +3129,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
+ * This is called from kmem_cache_close(). We must be the last thread
+ * using the cache and therefore we do not need to lock anymore.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
- unsigned long flags;
struct page *page, *h;
- spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
remove_partial(n, page);
@@ -2960,7 +3145,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
"Objects remaining on kmem_cache_close()");
}
}
- spin_unlock_irqrestore(&n->list_lock, flags);
}
/*
@@ -2994,6 +3178,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -3002,8 +3187,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -3321,23 +3506,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse) {
- remove_partial(n, page);
- discard_slab(s, page);
- } else {
- list_move(&page->lru,
- slabs_by_inuse + page->inuse);
- }
+ list_move(&page->lru, slabs_by_inuse + page->inuse);
+ if (!page->inuse)
+ n->nr_partial--;
}
/*
* Rebuild the partial list with the slabs filled up most
* first and the least used slabs at the end.
*/
- for (i = objects - 1; i >= 0; i--)
+ for (i = objects - 1; i > 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Release empty slabs */
+ list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ discard_slab(s, page);
}
kfree(slabs_by_inuse);
@@ -4293,6 +4478,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_possible_cpu(cpu) {
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct page *page;
if (!c || c->node < 0)
continue;
@@ -4308,6 +4494,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[c->node] += x;
}
+ page = c->partial;
+
+ if (page) {
+ x = page->pobjects;
+ total += x;
+ nodes[c->node] += x;
+ }
per_cpu[c->node]++;
}
}
@@ -4460,6 +4653,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
}
SLAB_ATTR(min_partial);
+static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->cpu_partial);
+}
+
+static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned long objects;
+ int err;
+
+ err = strict_strtoul(buf, 10, &objects);
+ if (err)
+ return err;
+
+ s->cpu_partial = objects;
+ flush_all(s);
+ return length;
+}
+SLAB_ATTR(cpu_partial);
+
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
if (!s->ctor)
@@ -4498,6 +4712,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objects_partial);
+static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ int objects = 0;
+ int pages = 0;
+ int cpu;
+ int len;
+
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+
+ if (page) {
+ pages += page->pages;
+ objects += page->pobjects;
+ }
+ }
+
+ len = sprintf(buf, "%d(%d)", objects, pages);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
+
+ if (page && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " C%d=%d(%d)", cpu,
+ page->pobjects, page->pages);
+ }
+#endif
+ return len + sprintf(buf + len, "\n");
+}
+SLAB_ATTR_RO(slabs_cpu_partial);
+
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4820,6 +5065,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
+STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
+STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
#endif
static struct attribute *slab_attrs[] = {
@@ -4828,6 +5075,7 @@ static struct attribute *slab_attrs[] = {
&objs_per_slab_attr.attr,
&order_attr.attr,
&min_partial_attr.attr,
+ &cpu_partial_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
&partial_attr.attr,
@@ -4840,6 +5088,7 @@ static struct attribute *slab_attrs[] = {
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
&reserved_attr.attr,
+ &slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
&slabs_attr.attr,
@@ -4881,6 +5130,8 @@ static struct attribute *slab_attrs[] = {
&order_fallback_attr.attr,
&cmpxchg_double_fail_attr.attr,
&cmpxchg_double_cpu_fail_attr.attr,
+ &cpu_partial_alloc_attr.attr,
+ &cpu_partial_free_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 464621d18eb2..5016f19e1661 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
-#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
- VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
- VMALLOC_PAGES / NR_CPUS / 16))
+#define VMAP_BBMAP_BITS \
+ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
+ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
+ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
@@ -2139,6 +2140,14 @@ struct vm_struct *alloc_vm_area(size_t size)
return NULL;
}
+ /*
+ * If the allocated address space is passed to a hypercall
+ * before being used then we cannot rely on a page fault to
+ * trigger an update of the page tables. So sync all the page
+ * tables here.
+ */
+ vmalloc_sync_all();
+
return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ef69124fa3e..9fdfce7ba403 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
- struct memcg_scanrecord *memcg_record;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
- if (!scanning_global_lru(sc))
- sc->memcg_record->nr_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
reclaim_stat->recent_scanned[0] += *nr_anon;
reclaim_stat->recent_scanned[1] += *nr_file;
- if (!scanning_global_lru(sc)) {
- sc->memcg_record->nr_scanned[0] += *nr_anon;
- sc->memcg_record->nr_scanned[1] += *nr_file;
- }
}
/*
@@ -1423,7 +1416,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
- /* If we have relaimed everything on the isolated list, no stall */
+ /* If we have reclaimed everything on the isolated list, no stall */
if (nr_freed == nr_taken)
return false;
@@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
- if (!scanning_global_lru(sc))
- sc->memcg_record->nr_freed[file] += nr_reclaimed;
-
local_irq_disable();
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
reclaim_stat->recent_scanned[file] += nr_taken;
- if (!scanning_global_lru(sc))
- sc->memcg_record->nr_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
if (file)
@@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- if (!scanning_global_lru(sc))
- sc->memcg_record->nr_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
@@ -1808,23 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
- int force_scan = 0;
+ bool force_scan = false;
unsigned long nr_force_scan[2];
-
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
- if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
- /* kswapd does zone balancing and need to scan this zone */
- if (scanning_global_lru(sc) && current_is_kswapd())
- force_scan = 1;
- /* memcg may have small limit and need to avoid priority drop */
- if (!scanning_global_lru(sc))
- force_scan = 1;
- }
+ /* kswapd does zone balancing and needs to scan this zone */
+ if (scanning_global_lru(sc) && current_is_kswapd())
+ force_scan = true;
+ /* memcg may have small limit and need to avoid priority drop */
+ if (!scanning_global_lru(sc))
+ force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1837,6 +1815,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
goto out;
}
+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
@@ -2268,10 +2251,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap,
- struct zone *zone,
- struct memcg_scanrecord *rec,
- unsigned long *scanned)
+ gfp_t gfp_mask, bool noswap,
+ struct zone *zone,
+ unsigned long *nr_scanned)
{
struct scan_control sc = {
.nr_scanned = 0,
@@ -2281,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.may_swap = !noswap,
.order = 0,
.mem_cgroup = mem,
- .memcg_record = rec,
};
- unsigned long start, end;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2292,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
sc.may_writepage,
sc.gfp_mask);
- start = sched_clock();
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -2301,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
- end = sched_clock();
-
- if (rec)
- rec->elapsed += end - start;
- *scanned = sc.nr_scanned;
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ *nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
- bool noswap,
- struct memcg_scanrecord *rec)
+ bool noswap)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
- unsigned long start, end;
int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
@@ -2328,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.order = 0,
.mem_cgroup = mem_cont,
- .memcg_record = rec,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2337,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.gfp_mask = sc.gfp_mask,
};
- start = sched_clock();
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
@@ -2352,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
- end = sched_clock();
- if (rec)
- rec->elapsed += end - start;
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -2529,6 +2497,9 @@ loop_again:
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
+ } else {
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
}
}
if (i < 0)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b2..d52b13d28e8f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
}
#endif
-#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = {
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
-#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
#ifdef CONFIG_PROC_FS