aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-29 19:12:44 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-29 19:12:44 -0800
commita0908a1b7d68706ee52ed4a039756e70c8e956e9 (patch)
treece23d955c37b04d816fc7d551db71d4ff86630f7 /mm
parentMerge tag 'nfsd-4.15-1' of git://linux-nfs.org/~bfields/linux (diff)
parentfs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate() (diff)
downloadlinux-dev-a0908a1b7d68706ee52ed4a039756e70c8e956e9.tar.xz
linux-dev-a0908a1b7d68706ee52ed4a039756e70c8e956e9.zip
Merge branch 'akpm' (patches from Andrew)
Mergr misc fixes from Andrew Morton: "28 fixes" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits) fs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate() mm/hugetlb: fix NULL-pointer dereference on 5-level paging machine autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored" autofs: revert "autofs: take more care to not update last_used on path walk" fs/fat/inode.c: fix sb_rdonly() change mm, memcg: fix mem_cgroup_swapout() for THPs mm: migrate: fix an incorrect call of prep_transhuge_page() kmemleak: add scheduling point to kmemleak_scan() scripts/bloat-o-meter: don't fail with division by 0 fs/mbcache.c: make count_objects() more robust Revert "mm/page-writeback.c: print a warning if the vm dirtiness settings are illogical" mm/madvise.c: fix madvise() infinite loop under special circumstances exec: avoid RLIMIT_STACK races with prlimit() IB/core: disable memory registration of filesystem-dax vmas v4l2: disable filesystem-dax mapping support mm: fail get_vaddr_frames() for filesystem-dax mappings mm: introduce get_user_pages_longterm device-dax: implement ->split() to catch invalid munmap attempts mm, hugetlbfs: introduce ->split() to vm_operations_struct scripts/faddr2line: extend usage on generic arch ...
Diffstat (limited to 'mm')
-rw-r--r--mm/frame_vector.c12
-rw-r--r--mm/gup.c66
-rw-r--r--mm/hmm.c8
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/madvise.c4
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory.c8
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/page-writeback.c5
-rw-r--r--mm/page_alloc.c13
13 files changed, 121 insertions, 32 deletions
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 2f98df0d460e..297c7238f7d4 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -53,6 +53,18 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
ret = -EFAULT;
goto out;
}
+
+ /*
+ * While get_vaddr_frames() could be used for transient (kernel
+ * controlled lifetime) pinning of memory pages all current
+ * users establish long term (userspace controlled lifetime)
+ * page pinning. Treat get_vaddr_frames() like
+ * get_user_pages_longterm() and disallow it for filesystem-dax
+ * mappings.
+ */
+ if (vma_is_fsdax(vma))
+ return -EOPNOTSUPP;
+
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
diff --git a/mm/gup.c b/mm/gup.c
index dfcde13f289a..d3fb60e5bfac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
*/
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
- return pte_write(pte) ||
+ return pte_access_permitted(pte, WRITE) ||
((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas_arg)
+{
+ struct vm_area_struct **vmas = vmas_arg;
+ struct vm_area_struct *vma_prev = NULL;
+ long rc, i;
+
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas) {
+ vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas)
+ return -ENOMEM;
+ }
+
+ rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+ for (i = 0; i < rc; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ break;
+ }
+
+ /*
+ * Either get_user_pages() failed, or the vma validation
+ * succeeded, in either case we don't need to put_page() before
+ * returning.
+ */
+ if (i >= rc)
+ goto out;
+
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+out:
+ if (vmas != vmas_arg)
+ kfree(vmas);
+ return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
diff --git a/mm/hmm.c b/mm/hmm.c
index ea19742a5d60..3a5c172af560 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -391,11 +391,11 @@ again:
if (pmd_protnone(pmd))
return hmm_vma_walk_clear(start, end, walk);
- if (write_fault && !pmd_write(pmd))
+ if (!pmd_access_permitted(pmd, write_fault))
return hmm_vma_walk_clear(start, end, walk);
pfn = pmd_pfn(pmd) + pte_index(addr);
- flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+ flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0;
for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
return 0;
@@ -456,11 +456,11 @@ again:
continue;
}
- if (write_fault && !pte_write(pte))
+ if (!pte_access_permitted(pte, write_fault))
goto fault;
pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
- pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+ pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0;
continue;
fault:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0e7ded98d114..2f2f5e774902 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
*/
WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE))
return NULL;
if (pmd_present(*pmd) && pmd_devmap(*pmd))
@@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pud_lockptr(mm, pud));
- if (flags & FOLL_WRITE && !pud_write(*pud))
+ if (!pud_access_permitted(*pud, flags & FOLL_WRITE))
return NULL;
if (pud_present(*pud) && pud_devmap(*pud))
@@ -1386,7 +1386,7 @@ out_unlock:
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
- return pmd_write(pmd) ||
+ return pmd_access_permitted(pmd, WRITE) ||
((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 681b300185c0..9a334f5fb730 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
}
}
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (addr & ~(huge_page_mask(hstate_vma(vma))))
+ return -EINVAL;
+ return 0;
+}
+
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
+ .split = hugetlb_vm_op_split,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -4627,7 +4635,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
- p4d = p4d_offset(pgd, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e4738d5e9b8c..3d4781756d50 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1523,6 +1523,8 @@ static void kmemleak_scan(void)
if (page_count(page) == 0)
continue;
scan_block(page, page + 1, NULL);
+ if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page))))
+ cond_resched();
}
}
put_online_mems();
diff --git a/mm/madvise.c b/mm/madvise.c
index 375cf32087e4..751e97aa2210 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_area_struct *vma,
{
struct file *file = vma->vm_file;
+ *prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- *prev = vma;
force_swapin_readahead(vma, start, end);
return 0;
}
if (shmem_mapping(file->f_mapping)) {
- *prev = vma;
force_shm_swapin_readahead(vma, start, end,
file->f_mapping);
return 0;
@@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
- *prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 50e6906314f8..ac2ffd5e02b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
- css_put(&memcg->css);
+ css_put_many(&memcg->css, nr_entries);
}
/**
diff --git a/mm/memory.c b/mm/memory.c
index 85e7a87da79f..5eb3d2524bdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3948,7 +3948,7 @@ static int handle_pte_fault(struct vm_fault *vmf)
if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
if (vmf->flags & FAULT_FLAG_WRITE) {
- if (!pte_write(entry))
+ if (!pte_access_permitted(entry, WRITE))
return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
@@ -4013,7 +4013,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* NUMA case for anonymous PUDs would go here */
- if (dirty && !pud_write(orig_pud)) {
+ if (dirty && !pud_access_permitted(orig_pud, WRITE)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4046,7 +4046,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if (dirty && !pmd_write(orig_pmd)) {
+ if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) {
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4336,7 +4336,7 @@ int follow_phys(struct vm_area_struct *vma,
goto out;
pte = *ptep;
- if ((flags & FOLL_WRITE) && !pte_write(pte))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto unlock;
*prot = pgprot_val(pte_pgprot(pte));
diff --git a/mm/mmap.c b/mm/mmap.c
index 924839fac0e6..a4d546821214 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2555,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- if (is_vm_hugetlb_page(vma) && (addr &
- ~(huge_page_mask(hstate_vma(vma)))))
- return -EINVAL;
+ if (vma->vm_ops && vma->vm_ops->split) {
+ err = vma->vm_ops->split(vma, addr);
+ if (err)
+ return err;
+ }
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c86fbd1b590e..c957be32b27a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -550,7 +550,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
*/
set_bit(MMF_UNSTABLE, &mm->flags);
- tlb_gather_mmu(&tlb, mm, 0, -1);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
if (!can_madv_dontneed_vma(vma))
continue;
@@ -565,11 +564,13 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* we do not want to block exit_mmap by keeping mm ref
* count elevated without a good reason.
*/
- if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
NULL);
+ tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ }
}
- tlb_finish_mmu(&tlb, 0, -1);
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e7095030aa1f..586f31261c83 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -433,11 +433,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
else
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
- if (unlikely(bg_thresh >= thresh)) {
- pr_warn("vm direct limit must be set greater than background limit.\n");
+ if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
- }
-
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d4096f4a5c1f..73f5d4556b3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2507,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
if (WARN_ON_ONCE(!mm_percpu_wq))
return;
- /* Workqueues cannot recurse */
- if (current->flags & PF_WQ_WORKER)
- return;
-
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
@@ -7656,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/*
* In case of -EBUSY, we'd like to know which page causes problem.
- * So, just fall through. We will check it in test_pages_isolated().
+ * So, just fall through. test_pages_isolated() has a tracepoint
+ * which will report the busy page.
+ *
+ * It is possible that busy pages could become available before
+ * the call to test_pages_isolated, and the range will actually be
+ * allocated. So, if we fall through be sure to clear ret so that
+ * -EBUSY is not accidentally used or returned to caller.
*/
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
+ ret =0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES