aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--mm/vmalloc.c192
1 files changed, 134 insertions, 58 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4a7d7459c4f9..4d3b3d60d893 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
@@ -682,7 +683,7 @@ insert_vmap_area_augment(struct vmap_area *va,
* free area is inserted. If VA has been merged, it is
* freed.
*/
-static __always_inline void
+static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
@@ -749,7 +750,10 @@ merge_or_add_vmap_area(struct vmap_area *va,
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
- return;
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
}
}
@@ -758,6 +762,8 @@ insert:
link_va(va, root, parent, link, head);
augment_tree_propagate_from(va);
}
+
+ return va;
}
static __always_inline bool
@@ -968,6 +974,19 @@ adjust_va_to_fit_type(struct vmap_area *va,
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
+ *
+ * Also we can hit this path in case of regular "vmap"
+ * allocations, if "this" current CPU was not preloaded.
+ * See the comment in alloc_vmap_area() why. If so, then
+ * GFP_NOWAIT is used instead to get an extra object for
+ * split purpose. That is rare and most time does not
+ * occur.
+ *
+ * What happens if an allocation gets failed. Basically,
+ * an "overflow" path is triggered to purge lazily freed
+ * areas to free some memory, then, the "retry" path is
+ * triggered to repeat one more time. See more details
+ * in alloc_vmap_area() function.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
@@ -1063,9 +1082,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
might_sleep();
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
- va = kmem_cache_alloc_node(vmap_area_cachep,
- gfp_mask & GFP_RECLAIM_MASK, node);
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1073,49 +1092,55 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
/*
- * Preload this CPU with one extra vmap_area object to ensure
- * that we have it available when fit type of free area is
- * NE_FIT_TYPE.
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
+ * does not guarantee that an allocation occurs on a CPU that
+ * is preloaded, instead we minimize the case when it is not.
+ * It can happen because of cpu migration, because there is a
+ * race until the below spinlock is taken.
*
* The preload is done in non-atomic context, thus it allows us
* to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure.
+ * low memory condition and high memory pressure. In rare case,
+ * if not preloaded, GFP_NOWAIT is used.
*
- * Even if it fails we do not really care about that. Just proceed
- * as it is. "overflow" path will refill the cache we allocate from.
+ * Set "pva" to NULL here, because of "retry" path.
*/
- preempt_disable();
- if (!__this_cpu_read(ne_fit_preload_node)) {
- preempt_enable();
- pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
- preempt_disable();
-
- if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
- if (pva)
- kmem_cache_free(vmap_area_cachep, pva);
- }
- }
+ pva = NULL;
- spin_lock(&vmap_area_lock);
- preempt_enable();
+ if (!this_cpu_read(ne_fit_preload_node))
+ /*
+ * Even if it fails we do not really care about that.
+ * Just proceed as it is. If needed "overflow" path
+ * will refill the cache we allocate from.
+ */
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(&free_vmap_area_lock);
+
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+ kmem_cache_free(vmap_area_cachep, pva);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+
if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -1125,7 +1150,6 @@ retry:
return va;
overflow:
- spin_unlock(&vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
@@ -1161,28 +1185,24 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
-static void __free_vmap_area(struct vmap_area *va)
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
{
/*
* Remove from the busy tree/list.
*/
+ spin_lock(&vmap_area_lock);
unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
/*
- * Merge VA with its neighbors, otherwise just add it.
+ * Insert/Merge it back to the free tree/list.
*/
- merge_or_add_vmap_area(va,
- &free_vmap_area_root, &free_vmap_area_list);
-}
-
-/*
- * Free a region of KVA allocated by alloc_vmap_area
- */
-static void free_vmap_area(struct vmap_area *va)
-{
- spin_lock(&vmap_area_lock);
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
}
/*
@@ -1275,24 +1295,30 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long orig_start = va->va_start;
+ unsigned long orig_end = va->va_end;
/*
* Finally insert or merge lazily-freed area. It is
* detached and there is no need to "unlink" it from
* anything.
*/
- merge_or_add_vmap_area(va,
- &free_vmap_area_root, &free_vmap_area_list);
+ va = merge_or_add_vmap_area(va, &free_vmap_area_root,
+ &free_vmap_area_list);
+
+ if (is_vmalloc_or_module_addr((void *)orig_start))
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
- cond_resched_lock(&vmap_area_lock);
+ cond_resched_lock(&free_vmap_area_lock);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
return true;
}
@@ -2014,15 +2040,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
{
- spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
+}
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ spin_lock(&vmap_area_lock);
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
spin_unlock(&vmap_area_lock);
}
@@ -2068,6 +2100,22 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * For KASAN, if we are in vmalloc space, we need to cover the shadow
+ * area with real memory. If we come here through VM_ALLOC, this is
+ * done by a higher level function that has access to the true size,
+ * which might not be a full page.
+ *
+ * We assume module space comes via VM_ALLOC path.
+ */
+ if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) {
+ if (kasan_populate_vmalloc(area->size, area)) {
+ unmap_vmap_area(va);
+ kfree(area);
+ return NULL;
+ }
+ }
+
return area;
}
@@ -2245,6 +2293,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+ if (area->flags & VM_KASAN)
+ kasan_poison_vmalloc(area->addr, area->size);
+
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
@@ -2440,7 +2491,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
+ if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
@@ -2497,6 +2548,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!addr)
return NULL;
+ if (is_vmalloc_or_module_addr(area->addr)) {
+ if (kasan_populate_vmalloc(real_size, area))
+ return NULL;
+ }
+
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
@@ -3282,7 +3338,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free;
}
retry:
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
@@ -3364,29 +3420,44 @@ retry:
va = vas[area];
va->va_start = start;
va->va_end = start + size;
-
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
/* insert all vm's */
- for (area = 0; area < nr_vms; area++)
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ spin_lock(&vmap_area_lock);
+ for (area = 0; area < nr_vms; area++) {
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
+ }
+ spin_unlock(&vmap_area_lock);
+
+ /* populate the shadow space outside of the lock */
+ for (area = 0; area < nr_vms; area++) {
+ /* assume success here */
+ kasan_populate_vmalloc(sizes[area], vms[area]);
+ }
kfree(vas);
return vms;
recovery:
- /* Remove previously inserted areas. */
+ /*
+ * Remove previously allocated areas. There is no
+ * need in removing these areas from the busy tree,
+ * because they are inserted only on the final step
+ * and when pcpu_get_vm_areas() is success.
+ */
while (area--) {
- __free_vmap_area(vas[area]);
+ merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
vas[area] = NULL;
}
overflow:
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = true;
@@ -3437,9 +3508,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmap_purge_lock)
__acquires(&vmap_area_lock)
{
+ mutex_lock(&vmap_purge_lock);
spin_lock(&vmap_area_lock);
+
return seq_list_start(&vmap_area_list, *pos);
}
@@ -3449,8 +3523,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmap_purge_lock)
__releases(&vmap_area_lock)
{
+ mutex_unlock(&vmap_purge_lock);
spin_unlock(&vmap_area_lock);
}