diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/gup.c | 60 | ||||
-rw-r--r-- | mm/huge_memory.c | 79 | ||||
-rw-r--r-- | mm/hugetlb.c | 18 | ||||
-rw-r--r-- | mm/internal.h | 15 | ||||
-rw-r--r-- | mm/madvise.c | 9 | ||||
-rw-r--r-- | mm/memblock.c | 3 | ||||
-rw-r--r-- | mm/memcontrol.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 26 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 10 | ||||
-rw-r--r-- | mm/mempolicy.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 18 | ||||
-rw-r--r-- | mm/mlock.c | 3 | ||||
-rw-r--r-- | mm/mmap.c | 40 | ||||
-rw-r--r-- | mm/mprotect.c | 3 | ||||
-rw-r--r-- | mm/mremap.c | 50 | ||||
-rw-r--r-- | mm/rmap.c | 118 | ||||
-rw-r--r-- | mm/slab.c | 13 | ||||
-rw-r--r-- | mm/slab.h | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 23 | ||||
-rw-r--r-- | mm/slob.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 204 | ||||
-rw-r--r-- | mm/userfaultfd.c | 308 | ||||
-rw-r--r-- | mm/vmscan.c | 30 |
25 files changed, 902 insertions, 167 deletions
diff --git a/mm/Makefile b/mm/Makefile index 98c4eaeabdcb..b424d5e5b6ff 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/mm/dmapool.c b/mm/dmapool.c index fd5fe4342e93..59d10d16f0a5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return page; } -static inline int is_page_busy(struct dma_page *page) +static inline bool is_page_busy(struct dma_page *page) { return page->in_use != 0; } @@ -12,7 +12,9 @@ #include <linux/sched.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> + #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include "internal.h" @@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma, return NULL; } +static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, unsigned int flags) +{ + /* No page to get reference */ + if (flags & FOLL_GET) + return -EFAULT; + + if (flags & FOLL_TOUCH) { + pte_t entry = *pte; + + if (flags & FOLL_WRITE) + entry = pte_mkdirty(entry); + entry = pte_mkyoung(entry); + + if (!pte_same(*pte, entry)) { + set_pte_at(vma->vm_mm, address, pte, entry); + update_mmu_cache(vma, address, pte); + } + } + + /* Proper page table entry exists, but no corresponding struct page */ + return -EEXIST; +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -73,10 +99,21 @@ retry: page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); + if (flags & FOLL_DUMP) { + /* Avoid special (like zero) pages in core dumps */ + page = ERR_PTR(-EFAULT); + goto out; + } + + if (is_zero_pfn(pte_pfn(pte))) { + page = pte_page(pte); + } else { + int ret; + + ret = follow_pfn_pte(vma, address, ptep, flags); + page = ERR_PTR(ret); + goto out; + } } if (flags & FOLL_GET) @@ -114,12 +151,9 @@ retry: unlock_page(page); } } +out: pte_unmap_unlock(ptep, ptl); return page; -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) @@ -489,9 +523,15 @@ retry: goto next_page; } BUG(); - } - if (IS_ERR(page)) + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. + */ + goto next_page; + } else if (IS_ERR(page)) { return i ? i : PTR_ERR(page); + } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 097c7a4bfbd9..279a818a39b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/migrate.h> #include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - struct page *page, gfp_t gfp) + unsigned long address, pmd_t *pmd, + struct page *page, gfp_t gfp, + unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; + unsigned long haddr = address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) - return VM_FAULT_OOM; + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg); + put_page(page); return VM_FAULT_OOM; } @@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + int ret; + + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); @@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&mm->nr_ptes); spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); } return 0; @@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) } /* Caller must hold page table lock. */ -static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; - if (!pmd_none(*pmd)) - return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); - return true; } int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pgtable_t pgtable; struct page *zero_page; bool set; + int ret; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(ptl); + ret = 0; + set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, mm, vma, + haddr, pmd, + zero_page); + spin_unlock(ptl); + set = true; + } + } else + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); } - return 0; + return ret; } gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - - count_vm_event(THP_FAULT_ALLOC); - return 0; + return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, + flags); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { struct page *zero_page; - bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ zero_page = get_huge_zero_page(); - set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); - BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } @@ -2133,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2586,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c3087089d8..51ae41d0fbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma, long chg) +static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { /* @@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) * properly, so add work-around here. */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return 1; + return true; else - return 0; + return false; } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) - return 1; + return true; /* * Only the process that called mmap() has reserves for * private mappings. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; + return true; - return 0; + return false; } static void enqueue_huge_page(struct hstate *h, struct page *page) @@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) { unsigned long base = addr & PUD_MASK; unsigned long end = base + PUD_SIZE; @@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) */ if (vma->vm_flags & VM_MAYSHARE && vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; + return true; + return false; } /* diff --git a/mm/internal.h b/mm/internal.h index 36b23f1e2ca6..1195dd2d6a2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_FAIR 0x100 /* fair zone allocation */ +enum ttu_flags; +struct tlbflush_unmap_batch; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void try_to_unmap_flush(void); +void try_to_unmap_flush_dirty(void); +#else +static inline void try_to_unmap_flush(void) +{ +} +static inline void try_to_unmap_flush_dirty(void) +{ +} + +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/madvise.c b/mm/madvise.c index 64bb8a22110c..ce3a4222c7e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; @@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, } } -static int +static bool madvise_behavior_valid(int behavior) { switch (behavior) { @@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: - return 1; + return true; default: - return 0; + return false; } } diff --git a/mm/memblock.c b/mm/memblock.c index 87108e77e476..95ce68c6da8a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,6 +566,9 @@ repeat: * area, insert that portion. */ if (rbase > base) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + WARN_ON(nid != memblock_get_region_node(rgn)); +#endif nr_new++; if (insert) memblock_insert_region(type, i++, base, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index acb93c554f6e..1af057575ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); - /* Caller disabled preemption with mapping->tree_lock */ + /* + * Interrupts should be disabled here because the caller holds the + * mapping->tree_lock lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for udpating the per-CPU variables. + */ + VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); } diff --git a/mm/memory.c b/mm/memory.c index 388dcf9aa283..bb04d8f2f86c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,7 @@ #include <linux/string.h> #include <linux/dma-debug.h> #include <linux/debugfs.h> +#include <linux/userfaultfd_k.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) #ifdef HAVE_GENERIC_MMU_GATHER -static int tlb_next_batch(struct mmu_gather *tlb) +static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; - return 1; + return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return 0; + return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) - return 0; + return false; tlb->batch_count++; batch->next = NULL; @@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb) tlb->active->next = batch; tlb->active = batch; - return 1; + return true; } /* tlb_gather_mmu @@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } goto setpte; } @@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); mem_cgroup_commit_charge(page, memcg, false); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6da82bcb0a8b..8fd97dac538a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); + /* + * Add new range to memblock so that when hotadd_new_pgdat() is called + * to allocate new pgdat, get_pfn_range_for_nid() will be able to find + * this new range and calculate total pages correctly. The range will + * be removed at hot-remove time. + */ + memblock_add_node(start, size, nid); + new_node = !node_online(nid); if (new_node) { pgdat = hotadd_new_pgdat(nid, start); @@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); - memblock_add_node(start, size, nid); goto out; @@ -1286,6 +1293,7 @@ error: if (new_pgdat) rollback_node_hotadd(nid, pgdat); release_memory_resource(res); + memblock_remove(start, size); out: mem_hotplug_done(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 99d4c1d0b858..a7f1e0d1d6b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/migrate.c b/mm/migrate.c index eb4267107d1f..5c08cab5419e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, pp->addr, + FOLL_GET | FOLL_SPLIT | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - /* Use PageReserved to check for zero page */ - if (PageReserved(page)) - goto put_and_set; - pp->page = page; err = page_to_nid(page); @@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma || addr < vma->vm_start) goto set_status; - page = follow_page(vma, addr, 0); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, addr, FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) goto set_status; - err = -ENOENT; - /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) - goto set_status; - - err = page_to_nid(page); + err = page ? page_to_nid(page) : -ENOENT; set_status: *status = err; diff --git a/mm/mlock.c b/mm/mlock.c index 6fd2cf15e868..25936680064f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index f126923ce683..82db4fc0a9d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -41,6 +41,7 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> +#include <linux/userfaultfd_k.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end); * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma->vm_ops && vma->vm_ops->close) return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; return 1; } @@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { + anon_vma, file, pgoff, + vm_userfaultfd_ctx)) { /* * OK, it can. Can we now merge in the successor as well? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && + anon_vma, file, + pgoff+pglen, + vm_userfaultfd_ctx) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Can we just expand an old mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, - NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e7d6f1171ecb..ef5be8eaab00 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; goto success; diff --git a/mm/mremap.c b/mm/mremap.c index a7c93eceb1c8..5a71cce8c6ea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { + err = -ENOMEM; + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); + } + + if (unlikely(err)) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, @@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma = new_vma; old_len = new_len; old_addr = new_addr; - new_addr = -ENOMEM; + new_addr = err; } else { - if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, - old_addr, moved_len, true); - return err; - } - } arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); + unsigned long pgoff; if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); @@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) return ERR_PTR(-EFAULT); + if (new_len == old_len) + return vma; + /* Need to be careful about a growing mapping */ - if (new_len > old_len) { - unsigned long pgoff; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); - } + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) goto out; - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. - */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) + /* Ensure the old/new locations do not overlap */ + if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; ret = do_munmap(mm, new_addr, new_len); @@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) + if (ret & ~PAGE_MASK) { vm_unacct_memory(charged); + locked = 0; + } up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); diff --git a/mm/rmap.c b/mm/rmap.c index 171b68768df1..0db38e7d0a72 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -62,6 +62,8 @@ #include <asm/tlbflush.h> +#include <trace/events/tlb.h> + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void percpu_flush_tlb_batch_pages(void *data) +{ + /* + * All TLB entries are flushed on the assumption that it is + * cheaper to flush all TLBs and let them be refilled than + * flushing individual PFNs. Note that we do not track mm's + * to flush as that might simply be multiple full TLB flushes + * for no gain. + */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_local(); +} + +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage. + */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int cpu; + + if (!tlb_ubc->flush_required) + return; + + cpu = get_cpu(); + + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); + + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { + smp_call_function_many(&tlb_ubc->cpumask, + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + } + cpumask_clear(&tlb_ubc->cpumask); + tlb_ubc->flush_required = false; + tlb_ubc->writable = false; + put_cpu(); +} + +/* Flush iff there are potentially writable TLB entries that can race with IO */ +void try_to_unmap_flush_dirty(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + + if (tlb_ubc->writable) + try_to_unmap_flush(); +} + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + tlb_ubc->flush_required = true; + + /* + * If the PTE was dirty then it's best to assume it's writable. The + * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() + * before the page is queued for IO. + */ + if (writable) + tlb_ubc->writable = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs. + */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * At what user virtual address is page expected in vma? * Caller should check the page is actually part of the vma. @@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially a remote + * CPU could still be writing to the page. If the entry was + * previously clean then the architecture must guarantee that + * a clear->dirty transition on a cached TLB entry is written + * through and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pte); + + set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pte); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) diff --git a/mm/slab.c b/mm/slab.c index bbd0b47dc6a9..60c936938b84 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 8da63e4e470f..a3a967d7d7c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objecct listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. The caller must hold @@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, cachep->name, s->name); + __func__, s->name, cachep->name); WARN_ON_ONCE(1); return s; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 86831105a09f..c26829fe4e37 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { diff --git a/mm/slob.c b/mm/slob.c index 4765f65019c7..165bbd3cd606 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index f68c0e50f3c0..084184e706c6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); + } +} + /* * Slab allocation and freeing */ @@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + void *start, *p; + int idx, order; flags &= gfp_allowed_mask; @@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(s, alloc_gfp, node, oo); - - if (page) - stat(s, ORDER_FALLBACK); + if (unlikely(!page)) + goto out; + stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && page - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + if (kmemcheck_enabled && + !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); @@ -1380,51 +1395,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } - if (flags & __GFP_WAIT) - local_irq_disable(); - if (!page) - return NULL; - page->objects = oo_objects(oo); - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - - return page; -} - -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) -{ - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); - } -} - -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) -{ - struct page *page; - void *start; - void *p; - int order; - int idx; - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); - } - - page = allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - if (!page) - goto out; order = compound_order(page); - inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) @@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = page->objects; page->frozen = 1; + out: + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + + inc_slabs_node(s, page_to_nid(page), page->objects); + return page; } +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + static void __free_slab(struct kmem_cache *s, struct page *page) { int order = compound_order(page); @@ -2712,7 +2709,7 @@ redo: * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. */ do { tid = this_cpu_read(s->cpu_slab->tid); @@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +/* Note that interrupts must be enabled when calling this function. */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct kmem_cache_cpu *c; + struct page *page; + int i; + + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = p[i]; + + BUG_ON(!object); + /* kmem cache debug support */ + s = cache_from_obj(s, object); + if (unlikely(!s)) + goto exit; + slab_free_hook(s, object); + + page = virt_to_head_page(object); + + if (c->page == page) { + /* Fastpath: local CPU free */ + set_freepointer(s, object, c->freelist); + c->freelist = object; + } else { + c->tid = next_tid(c->tid); + local_irq_enable(); + /* Slowpath: overhead locked cmpxchg_double_slab */ + __slab_free(s, page, object, _RET_IP_); + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + } + } +exit: + c->tid = next_tid(c->tid); + local_irq_enable(); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* Note that interrupts must be enabled when calling this function. */ +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + int i; + + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (unlikely(!object)) { + local_irq_enable(); + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c); + if (unlikely(!p[i])) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + continue; /* goto for-loop */ + } + + /* kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) { + __kmem_cache_free_bulk(s, i, p); + c->tid = next_tid(c->tid); + local_irq_enable(); + return false; + } + + c->freelist = get_freepointer(s, object); + p[i] = object; + + /* kmem_cache debug support */ + slab_post_alloc_hook(s, flags, object); + } + c->tid = next_tid(c->tid); + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + + return true; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) - goto out_put_kobj; + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -5208,8 +5312,6 @@ out: return err; out_del_kobj: kobject_del(&s->kobj); -out_put_kobj: - kobject_put(&s->kobj); goto out; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000000..77fee9325a57 --- /dev/null +++ b/mm/userfaultfd.c @@ -0,0 +1,308 @@ +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/userfaultfd_k.h> +#include <linux/mmu_notifier.h> +#include <asm/tlbflush.h> +#include "internal.h" + +static int mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct mem_cgroup *memcg; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + void *page_kaddr; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + ret = -ENOMEM; + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + goto out_release; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + inc_mm_counter(dst_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, dst_vma); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); + mem_cgroup_cancel_charge(page, memcg); +out_release: + page_cache_release(page); + goto out; +} + +static int mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + int ret; + + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_unlock; + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (pud) + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + pmd = pmd_alloc(mm, pud, address); + return pmd; +} + +static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + struct vm_area_struct *dst_vma; + ssize_t err; + pmd_t *dst_pmd; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + + /* + * Sanitize the command parameters: + */ + BUG_ON(dst_start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(src_start + len <= src_start); + BUG_ON(dst_start + len <= dst_start); + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; +retry: + down_read(&dst_mm->mmap_sem); + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + /* + * Be strict and only allow __mcopy_atomic on userfaultfd + * registered ranges to prevent userland errors going + * unnoticed. As far as the VM consistency is concerned, it + * would be perfectly safe to remove this check, but there's + * no useful usage for __mcopy_atomic ouside of userfaultfd + * registered ranges. This is after all why these are ioctls + * belonging to the userfaultfd and not syscalls. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + /* + * FIXME: only allow copying on anonymous vmas, tmpfs should + * be added. + */ + if (dst_vma->vm_ops) + goto out_unlock; + + /* + * Ensure the dst_vma has a anon_vma or this page + * would get a NULL anon_vma when moved in the + * dst_vma. + */ + err = -ENOMEM; + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + + while (src_addr < src_start + len) { + pmd_t dst_pmdval; + + BUG_ON(dst_addr >= dst_start + len); + + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmd_read_atomic(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't + * override it and just be strict. + */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, + dst_addr))) { + err = -ENOMEM; + break; + } + /* If an huge pmd materialized from under us fail */ + if (unlikely(pmd_trans_huge(*dst_pmd))) { + err = -EFAULT; + break; + } + + BUG_ON(pmd_none(*dst_pmd)); + BUG_ON(pmd_trans_huge(*dst_pmd)); + + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, + dst_addr); + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + void *page_kaddr; + + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + page_kaddr = kmap(page); + err = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap(page); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += PAGE_SIZE; + src_addr += PAGE_SIZE; + copied += PAGE_SIZE; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) + page_cache_release(page); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? copied : err; +} + +ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len) +{ + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); +} + +ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, + unsigned long len) +{ + return __mcopy_atomic(dst_mm, start, 0, len, true); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index 8286938c70de..b1139039122a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1097,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; @@ -1208,6 +1214,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -2151,6 +2158,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -2185,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { |