diff options
author | 2020-09-01 14:19:48 +0200 | |
---|---|---|
committer | 2020-09-01 14:19:48 +0200 | |
commit | ead5d1f4d877e92c051e1a1ade623d0d30e71619 (patch) | |
tree | cb9db5698a546e7b96f7d5bef5ce544629dd37a2 /mm/vmalloc.c | |
parent | scif: Fix spelling of EACCES (diff) | |
parent | Merge tag 'docs-5.9-3' of git://git.lwn.net/linux (diff) | |
download | linux-dev-ead5d1f4d877e92c051e1a1ade623d0d30e71619.tar.xz linux-dev-ead5d1f4d877e92c051e1a1ade623d0d30e71619.zip |
Merge branch 'master' into for-next
Sync with Linus' branch in order to be able to apply fixups
of more recent patches.
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r-- | mm/vmalloc.c | 579 |
1 files changed, 237 insertions, 342 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1f46c3b86f9f..be4724b916b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -7,6 +7,7 @@ * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 * Numa awareness, Christoph Lameter, SGI, June 2005 + * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 */ #include <linux/vmalloc.h> @@ -25,7 +26,7 @@ #include <linux/list.h> #include <linux/notifier.h> #include <linux/rbtree.h> -#include <linux/radix-tree.h> +#include <linux/xarray.h> #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> @@ -34,12 +35,14 @@ #include <linux/llist.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> +#include <linux/overflow.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> #include "internal.h" +#include "pgalloc-track.h" bool is_vmalloc_addr(const void *x) { @@ -68,7 +71,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -77,73 +81,119 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); + + cond_resched(); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } -static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @start: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - pgd_t *pgd; + unsigned long end = start + size; unsigned long next; + pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -152,7 +202,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. */ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -165,94 +215,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/* - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array. +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * RETURNS: + * 0 on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long start = addr; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); - return nr; + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return 0; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; } @@ -441,6 +514,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) /* * This function returns back addresses of parent node * and its left or right link for further processing. + * + * Otherwise NULL is returned. In that case all further + * steps regarding inserting of conflicting overlap range + * have to be declined and actually considered as a bug. */ static __always_inline struct rb_node ** find_va_links(struct vmap_area *va, @@ -479,8 +556,12 @@ find_va_links(struct vmap_area *va, else if (va->va_end > tmp_va->va_start && va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; - else - BUG(); + else { + WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", + va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); + + return NULL; + } } while (*link); *parent = &tmp_va->rb_node; @@ -562,43 +643,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root) #if DEBUG_AUGMENT_PROPAGATE_CHECK static void -augment_tree_propagate_check(struct rb_node *n) +augment_tree_propagate_check(void) { struct vmap_area *va; - struct rb_node *node; - unsigned long size; - bool found = false; - - if (n == NULL) - return; - - va = rb_entry(n, struct vmap_area, rb_node); - size = va->subtree_max_size; - node = n; - - while (node) { - va = rb_entry(node, struct vmap_area, rb_node); - - if (get_subtree_max_size(node->rb_left) == size) { - node = node->rb_left; - } else { - if (va_size(va) == size) { - found = true; - break; - } - - node = node->rb_right; - } - } + unsigned long computed_size; - if (!found) { - va = rb_entry(n, struct vmap_area, rb_node); - pr_emerg("tree is corrupted: %lu, %lu\n", - va_size(va), va->subtree_max_size); + list_for_each_entry(va, &free_vmap_area_list, list) { + computed_size = compute_subtree_max_size(va); + if (computed_size != va->subtree_max_size) + pr_emerg("tree is corrupted: %lu, %lu\n", + va_size(va), va->subtree_max_size); } - - augment_tree_propagate_check(n->rb_left); - augment_tree_propagate_check(n->rb_right); } #endif @@ -632,28 +687,15 @@ augment_tree_propagate_check(struct rb_node *n) static __always_inline void augment_tree_propagate_from(struct vmap_area *va) { - struct rb_node *node = &va->rb_node; - unsigned long new_va_sub_max_size; - - while (node) { - va = rb_entry(node, struct vmap_area, rb_node); - new_va_sub_max_size = compute_subtree_max_size(va); - - /* - * If the newly calculated maximum available size of the - * subtree is equal to the current one, then it means that - * the tree is propagated correctly. So we have to stop at - * this point to save cycles. - */ - if (va->subtree_max_size == new_va_sub_max_size) - break; - - va->subtree_max_size = new_va_sub_max_size; - node = rb_parent(&va->rb_node); - } + /* + * Populate the tree from bottom towards the root until + * the calculated maximum available size of checked node + * is equal to its current one. + */ + free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); #if DEBUG_AUGMENT_PROPAGATE_CHECK - augment_tree_propagate_check(free_vmap_area_root.rb_node); + augment_tree_propagate_check(); #endif } @@ -665,7 +707,8 @@ insert_vmap_area(struct vmap_area *va, struct rb_node *parent; link = find_va_links(va, root, NULL, &parent); - link_va(va, root, parent, link, head); + if (link) + link_va(va, root, parent, link, head); } static void @@ -681,8 +724,10 @@ insert_vmap_area_augment(struct vmap_area *va, else link = find_va_links(va, root, NULL, &parent); - link_va(va, root, parent, link, head); - augment_tree_propagate_from(va); + if (link) { + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); + } } /* @@ -690,6 +735,11 @@ insert_vmap_area_augment(struct vmap_area *va, * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. + * + * Please note, it can return NULL in case of overlap + * ranges, followed by WARN() report. Despite it is a + * buggy behaviour, a system can be alive and keep + * ongoing. */ static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, @@ -706,6 +756,8 @@ merge_or_add_vmap_area(struct vmap_area *va, * inserted, unless it is merged with its sibling/siblings. */ link = find_va_links(va, root, NULL, &parent); + if (!link) + return NULL; /* * Get next node of VA to check if merging can be done. @@ -726,9 +778,6 @@ merge_or_add_vmap_area(struct vmap_area *va, if (sibling->va_start == va->va_end) { sibling->va_start = va->va_start; - /* Check and update the tree if needed. */ - augment_tree_propagate_from(sibling); - /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); @@ -748,14 +797,18 @@ merge_or_add_vmap_area(struct vmap_area *va, if (next->prev != head) { sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { - sibling->va_end = va->va_end; - - /* Check and update the tree if needed. */ - augment_tree_propagate_from(sibling); - + /* + * If both neighbors are coalesced, it is important + * to unlink the "next" node first, followed by merging + * with "previous" one. Otherwise the tree might not be + * fully populated if a sibling's augmented value is + * "normalized" because of rotation operations. + */ if (merged) unlink_va(va, root); + sibling->va_end = va->va_end; + /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); @@ -766,11 +819,13 @@ merge_or_add_vmap_area(struct vmap_area *va, } insert: - if (!merged) { + if (!merged) link_va(va, root, parent, link, head); - augment_tree_propagate_from(va); - } + /* + * Last step is to check and update the tree. + */ + augment_tree_propagate_from(va); return va; } @@ -1222,14 +1277,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* - * Clear the pagetable entries of a given vmap_area - */ -static void unmap_vmap_area(struct vmap_area *va) -{ - vunmap_page_range(va->va_start, va->va_end); -} - -/* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * @@ -1292,12 +1339,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) return false; /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_all(); - - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. */ @@ -1325,6 +1366,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) va = merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + if (!va) + continue; + if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -1390,7 +1434,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_vmap_area(va); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1457,12 +1501,11 @@ struct vmap_block { static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); /* - * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * XArray of vmap blocks, indexed by address, to quickly find a vmap block * in the free path. Could get rid of this if we change the API to return a * "cookie" from alloc, to be passed to free. But no big deal yet. */ -static DEFINE_SPINLOCK(vmap_block_tree_lock); -static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); +static DEFINE_XARRAY(vmap_blocks); /* * We should probably have a fallback mechanism to allocate virtual memory @@ -1519,13 +1562,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) return ERR_CAST(va); } - err = radix_tree_preload(gfp_mask); - if (unlikely(err)) { - kfree(vb); - free_vmap_area(va); - return ERR_PTR(err); - } - vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; @@ -1538,11 +1574,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); - spin_lock(&vmap_block_tree_lock); - err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); - spin_unlock(&vmap_block_tree_lock); - BUG_ON(err); - radix_tree_preload_end(); + err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); + if (err) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } vbq = &get_cpu_var(vmap_block_queue); spin_lock(&vbq->lock); @@ -1556,12 +1593,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; - unsigned long vb_idx; - vb_idx = addr_to_vb_idx(vb->va->va_start); - spin_lock(&vmap_block_tree_lock); - tmp = radix_tree_delete(&vmap_block_tree, vb_idx); - spin_unlock(&vmap_block_tree_lock); + tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); free_vmap_area_noflush(vb->va); @@ -1664,34 +1697,25 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) return vaddr; } -static void vb_free(const void *addr, unsigned long size) +static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; - unsigned long vb_idx; unsigned int order; struct vmap_block *vb; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + flush_cache_vunmap(addr, addr + size); order = get_order(size); + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; + vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); - offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); - offset >>= PAGE_SHIFT; - - vb_idx = addr_to_vb_idx((unsigned long)addr); - rcu_read_lock(); - vb = radix_tree_lookup(&vmap_block_tree, vb_idx); - rcu_read_unlock(); - BUG_ON(!vb); - - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + unmap_kernel_range_noflush(addr, size); if (debug_pagealloc_enabled_static()) - flush_tlb_kernel_range((unsigned long)addr, - (unsigned long)addr + size); + flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); @@ -1791,7 +1815,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); - vb_free(mem, size); + vb_free(addr, size); return; } @@ -1808,7 +1832,6 @@ EXPORT_SYMBOL(vm_unmap_ram); * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node - * @prot: memory protection to use. PAGE_KERNEL for regular RAM * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life @@ -1818,7 +1841,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1842,7 +1865,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -1987,51 +2010,6 @@ void __init vmalloc_init(void) } /** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. - */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); - -/** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap * @size: size of the VM area to unmap @@ -2044,22 +2022,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size; flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); } -EXPORT_SYMBOL_GPL(unmap_kernel_range); - -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); - int err; - - err = vmap_page_range(addr, end, prot, pages); - - return err > 0 ? 0 : err; -} -EXPORT_SYMBOL_GPL(map_vm_area); static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) @@ -2127,14 +2092,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) @@ -2329,7 +2286,7 @@ static inline void __vfree_deferred(const void *addr) * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to - * nother cpu's list. schedule_work() should be fine with this too. + * another cpu's list. schedule_work() should be fine with this too. */ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); @@ -2440,7 +2397,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2449,9 +2407,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2469,7 +2424,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - PAGE_KERNEL, node, area->caller); + node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2503,8 +2458,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: @@ -2572,27 +2529,16 @@ fail: return NULL; } -/* - * This is only for performance analysis of vmalloc and stress purpose. - * It is required by vmalloc test module, therefore do not use it other - * than that. - */ -#ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node_range); -#endif - /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level allocator with + * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported @@ -2602,35 +2548,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, 0, node, caller); + gfp_mask, PAGE_KERNEL, 0, node, caller); } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node); +#endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - - -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -2645,8 +2584,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2665,8 +2604,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2703,8 +2642,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -2717,58 +2656,15 @@ EXPORT_SYMBOL(vmalloc_node); * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * - * For tight control over page level allocator and protection flags - * use __vmalloc_node() instead. - * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) @@ -2792,8 +2688,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -3054,6 +2950,7 @@ finished: * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory + * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure @@ -3066,9 +2963,15 @@ finished: * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, - void *kaddr, unsigned long size) + void *kaddr, unsigned long pgoff, + unsigned long size) { struct vm_struct *area; + unsigned long off; + unsigned long end_index; + + if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) + return -EINVAL; size = PAGE_ALIGN(size); @@ -3082,8 +2985,10 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; - if (kaddr + size > area->addr + get_vm_area_size(area)) + if (check_add_overflow(size, off, &end_index) || + end_index > get_vm_area_size(area)) return -EINVAL; + kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); @@ -3122,23 +3027,11 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, - addr + (pgoff << PAGE_SHIFT), + addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_all(void) -{ -} - - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; @@ -3365,7 +3258,7 @@ retry: goto overflow; /* - * If required width exeeds current VA block, move + * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { @@ -3460,8 +3353,9 @@ recovery: orig_end = vas[area]->va_end; va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, &free_vmap_area_list); - kasan_release_vmalloc(orig_start, orig_end, - va->va_start, va->va_end); + if (va) + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); vas[area] = NULL; } @@ -3509,8 +3403,9 @@ err_free_shadow: orig_end = vas[area]->va_end; va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, &free_vmap_area_list); - kasan_release_vmalloc(orig_start, orig_end, - va->va_start, va->va_end); + if (va) + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); vas[area] = NULL; kfree(vms[area]); } |