aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c43
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/kasan/quarantine.c51
-rw-r--r--mm/madvise.c44
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c18
-rw-r--r--mm/mlock.c9
-rw-r--r--mm/rmap.c13
-rw-r--r--mm/vmstat.c3
9 files changed, 145 insertions, 43 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6d861d090e9f..c6f2a37028c2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -683,33 +683,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
- struct rb_node *rbn;
void **slot;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
-
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
-
- while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
- struct bdi_writeback_congested *congested =
- rb_entry(rbn, struct bdi_writeback_congested, rb_node);
-
- rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
- }
-
spin_unlock_irq(&cgwb_lock);
/*
- * All cgwb's and their congested states must be shutdown and
- * released before returning. Drain the usage counter to wait for
- * all cgwb's and cgwb_congested's ever created on @bdi.
+ * All cgwb's must be shutdown and released before returning. Drain
+ * the usage counter to wait for all cgwb's ever created on @bdi.
*/
atomic_dec(&bdi->usage_cnt);
wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+ /*
+ * Grab back our reference so that we hold it when @bdi gets
+ * re-registered.
+ */
+ atomic_inc(&bdi->usage_cnt);
}
/**
@@ -749,6 +742,21 @@ void wb_blkcg_offline(struct blkcg *blkcg)
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_exit(struct backing_dev_info *bdi)
+{
+ struct rb_node *rbn;
+
+ spin_lock_irq(&cgwb_lock);
+ while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
+ struct bdi_writeback_congested *congested =
+ rb_entry(rbn, struct bdi_writeback_congested, rb_node);
+
+ rb_erase(rbn, &bdi->cgwb_congested_tree);
+ congested->bdi = NULL; /* mark @congested unlinked */
+ }
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -769,7 +777,9 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+
+static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
@@ -857,6 +867,8 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
MINOR(owner->devt));
if (rc)
return rc;
+ /* Leaking owner reference... */
+ WARN_ON(bdi->owner);
bdi->owner = owner;
get_device(owner);
return 0;
@@ -898,6 +910,7 @@ static void bdi_exit(struct backing_dev_info *bdi)
{
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
}
static void release_bdi(struct kref *ref)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e4766de25709..1ebc93e179f3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1828,7 +1828,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
- count_vm_event(THP_SPLIT_PMD);
+ count_vm_event(THP_SPLIT_PUD);
pudp_huge_clear_flush_notify(vma, haddr, pud);
}
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 6f1ed1630873..3a8ddf8baf7d 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -25,6 +25,7 @@
#include <linux/printk.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -103,6 +104,7 @@ static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
+DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
static unsigned long quarantine_max_size;
@@ -173,17 +175,22 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ /*
+ * Note: irq must be disabled until after we move the batch to the
+ * global quarantine. Otherwise quarantine_remove_cache() can miss
+ * some objects belonging to the cache if they are in our local temp
+ * list. quarantine_remove_cache() executes on_each_cpu() at the
+ * beginning which ensures that it either sees the objects in per-cpu
+ * lists or in the global quarantine.
+ */
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
qlist_put(q, &info->quarantine_link, cache->size);
- if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- local_irq_restore(flags);
-
- if (unlikely(!qlist_empty(&temp))) {
- spin_lock_irqsave(&quarantine_lock, flags);
+ spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
@@ -196,20 +203,33 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ spin_unlock(&quarantine_lock);
}
+
+ local_irq_restore(flags);
}
void quarantine_reduce(void)
{
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
+ int srcu_idx;
struct qlist_head to_free = QLIST_INIT;
if (likely(READ_ONCE(quarantine_size) <=
READ_ONCE(quarantine_max_size)))
return;
+ /*
+ * srcu critical section ensures that quarantine_remove_cache()
+ * will not miss objects belonging to the cache while they are in our
+ * local to_free list. srcu is chosen because (1) it gives us private
+ * grace period domain that does not interfere with anything else,
+ * and (2) it allows synchronize_srcu() to return without waiting
+ * if there are no pending read critical sections (which is the
+ * expected case).
+ */
+ srcu_idx = srcu_read_lock(&remove_cache_srcu);
spin_lock_irqsave(&quarantine_lock, flags);
/*
@@ -237,6 +257,7 @@ void quarantine_reduce(void)
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
+ srcu_read_unlock(&remove_cache_srcu, srcu_idx);
}
static void qlist_move_cache(struct qlist_head *from,
@@ -280,12 +301,28 @@ void quarantine_remove_cache(struct kmem_cache *cache)
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
+ /*
+ * Must be careful to not miss any objects that are being moved from
+ * per-cpu list to the global quarantine in quarantine_put(),
+ * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * achieves the first goal, while synchronize_srcu() achieves the
+ * second.
+ */
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
- for (i = 0; i < QUARANTINE_BATCHES; i++)
+ for (i = 0; i < QUARANTINE_BATCHES; i++) {
+ if (qlist_empty(&global_quarantine[i]))
+ continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
+ /* Scanning whole quarantine can take a while. */
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&quarantine_lock, flags);
+ }
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
+
+ synchronize_srcu(&remove_cache_srcu);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index dc5927c812d3..7a2abf0127ae 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -513,7 +513,43 @@ static long madvise_dontneed(struct vm_area_struct *vma,
if (!can_madv_dontneed_vma(vma))
return -EINVAL;
- userfaultfd_remove(vma, prev, start, end);
+ if (!userfaultfd_remove(vma, start, end)) {
+ *prev = NULL; /* mmap_sem has been dropped, prev is stale */
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, start);
+ if (!vma)
+ return -ENOMEM;
+ if (start < vma->vm_start) {
+ /*
+ * This "vma" under revalidation is the one
+ * with the lowest vma->vm_start where start
+ * is also < vma->vm_end. If start <
+ * vma->vm_start it means an hole materialized
+ * in the user address space within the
+ * virtual range passed to MADV_DONTNEED.
+ */
+ return -ENOMEM;
+ }
+ if (!can_madv_dontneed_vma(vma))
+ return -EINVAL;
+ if (end > vma->vm_end) {
+ /*
+ * Don't fail if end > vma->vm_end. If the old
+ * vma was splitted while the mmap_sem was
+ * released the effect of the concurrent
+ * operation may not cause MADV_DONTNEED to
+ * have an undefined result. There may be an
+ * adjacent next vma that we'll walk
+ * next. userfaultfd_remove() will generate an
+ * UFFD_EVENT_REMOVE repetition on the
+ * end-vma->vm_end range, but the manager can
+ * handle a repetition fine.
+ */
+ end = vma->vm_end;
+ }
+ VM_WARN_ON(start >= end);
+ }
zap_page_range(vma, start, end - start);
return 0;
}
@@ -554,8 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma,
* mmap_sem.
*/
get_file(f);
- userfaultfd_remove(vma, prev, start, end);
- up_read(&current->mm->mmap_sem);
+ if (userfaultfd_remove(vma, start, end)) {
+ /* mmap_sem was not released by userfaultfd_remove() */
+ up_read(&current->mm->mmap_sem);
+ }
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
diff --git a/mm/memblock.c b/mm/memblock.c
index b64b47803e52..696f06d17c4e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1118,7 +1118,10 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
}
} while (left < right);
- return min(PHYS_PFN(type->regions[right].base), max_pfn);
+ if (right == type->cnt)
+ return max_pfn;
+ else
+ return min(PHYS_PFN(type->regions[right].base), max_pfn);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c52ec893e241..2bd7541d7c11 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -466,6 +466,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
struct mem_cgroup_tree_per_node *mctz;
mctz = soft_limit_tree_from_page(page);
+ if (!mctz)
+ return;
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
@@ -503,7 +505,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(memcg, nid);
mctz = soft_limit_tree_node(nid);
- mem_cgroup_remove_exceeded(mz, mctz);
+ if (mctz)
+ mem_cgroup_remove_exceeded(mz, mctz);
}
}
@@ -2558,7 +2561,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* is empty. Do it lockless to prevent lock bouncing. Races
* are acceptable as soft limit is best effort anyway.
*/
- if (RB_EMPTY_ROOT(&mctz->rb_root))
+ if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
/*
@@ -4135,17 +4138,22 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static void mem_cgroup_free(struct mem_cgroup *memcg)
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
kfree(memcg);
}
+static void mem_cgroup_free(struct mem_cgroup *memcg)
+{
+ memcg_wb_domain_exit(memcg);
+ __mem_cgroup_free(memcg);
+}
+
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
@@ -4196,7 +4204,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
if (memcg->id.id > 0)
idr_remove(&mem_cgroup_idr, memcg->id.id);
- mem_cgroup_free(memcg);
+ __mem_cgroup_free(memcg);
return NULL;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 945edac46810..0dd9ca18e19e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -443,7 +443,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
while (start < end) {
struct page *page;
- unsigned int page_mask;
+ unsigned int page_mask = 0;
unsigned long page_increm;
struct pagevec pvec;
struct zone *zone;
@@ -457,8 +457,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* suits munlock very well (and if somehow an abnormal page
* has sneaked into the range, we won't oops here: great).
*/
- page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
- &page_mask);
+ page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
if (page && !IS_ERR(page)) {
if (PageTransTail(page)) {
@@ -469,8 +468,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
/*
* Any THP page found by follow_page_mask() may
* have gotten split before reaching
- * munlock_vma_page(), so we need to recompute
- * the page_mask here.
+ * munlock_vma_page(), so we need to compute
+ * the page_mask here instead.
*/
page_mask = munlock_vma_page(page);
unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 2984403a2424..49ed681ccc7b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1321,12 +1321,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
while (page_vma_mapped_walk(&pvmw)) {
- subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
- address = pvmw.address;
-
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
-
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1350,6 +1344,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
continue;
}
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 69f9aff39a2e..b1947f0cbee2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1065,6 +1065,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ "thp_split_pud",
+#endif
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif