From 168e47f2a6581fdbc5bb1845aeca1e50e2bc5c4b Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 25 Feb 2015 14:14:57 -0800 Subject: kernel/module.c: Update debug alignment after symtable generation When CONFIG_DEBUG_SET_MODULE_RONX is enabled, the sizes of module sections are aligned up so appropriate permissions can be applied. Adjusting for the symbol table may cause them to become unaligned. Make sure to re-align the sizes afterward. Signed-off-by: Laura Abbott Acked-by: Rusty Russell Signed-off-by: Catalin Marinas --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/module.c') diff --git a/kernel/module.c b/kernel/module.c index b34813f725e9..cc93cf68653c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2313,11 +2313,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_size += strtab_size; + mod->core_size = debug_align(mod->core_size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; + mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } -- cgit v1.3-8-gc7d7 From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 12 Mar 2015 16:26:11 -0700 Subject: kasan, module, vmalloc: rework shadow allocation for modules Current approach in handling shadow memory for modules is broken. Shadow memory could be freed only after memory shadow corresponds it is no longer used. vfree() called from interrupt context could use memory its freeing to store 'struct llist_node' in it: void vfree(const void *addr) { ... if (unlikely(in_interrupt())) { struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); Later this list node used in free_work() which actually frees memory. Currently module_memfree() called in interrupt context will free shadow before freeing module's memory which could provoke kernel crash. So shadow memory should be freed after module's memory. However, such deallocation order could race with kasan_module_alloc() in module_alloc(). Free shadow right before releasing vm area. At this point vfree()'d memory is not used anymore and yet not available for other allocations. New VM_KASAN flag used to indicate that vm area has dynamically allocated shadow memory so kasan frees shadow only if it was previously allocated. Signed-off-by: Andrey Ryabinin Acked-by: Rusty Russell Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 +++-- include/linux/vmalloc.h | 1 + kernel/module.c | 2 -- mm/kasan/kasan.c | 14 +++++++++++--- mm/vmalloc.c | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel/module.c') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72ba725ddf9c..5fa48a21d73e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -5,6 +5,7 @@ struct kmem_cache; struct page; +struct vm_struct; #ifdef CONFIG_KASAN @@ -52,7 +53,7 @@ void kasan_slab_free(struct kmem_cache *s, void *object); #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) int kasan_module_alloc(void *addr, size_t size); -void kasan_module_free(void *addr); +void kasan_free_shadow(const struct vm_struct *vm); #else /* CONFIG_KASAN */ @@ -82,7 +83,7 @@ static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_module_free(void *addr) {} +static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* CONFIG_KASAN */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7d7acb35603d..0ec598381f97 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -17,6 +17,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/kernel/module.c b/kernel/module.c index cc93cf68653c..b3d634ed06c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); - kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee632a7ee..936d81661c47 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "kasan.h" @@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size) GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); - return ret ? 0 : -ENOMEM; + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; } -void kasan_module_free(void *addr) +void kasan_free_shadow(const struct vm_struct *vm) { - vfree(kasan_mem_to_shadow(addr)); + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); } static void register_global(struct kasan_global *global) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1340ca..49abccf29a29 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; -- cgit v1.3-8-gc7d7 From 35a9393c95b31870a74f51a3e7455f33f5657b6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Feb 2015 16:23:11 +0100 Subject: lockdep: Fix the module unload key range freeing logic Module unload calls lockdep_free_key_range(), which removes entries from the data structures. Most of the lockdep code OTOH assumes the data structures are append only; in specific see the comments in add_lock_to_list() and look_up_lock_class(). Clearly this has only worked by accident; make it work proper. The actual scenario to make it go boom would involve the memory freed by the module unlock being re-allocated and re-used for a lock inside of a rcu-sched grace period. This is a very unlikely scenario, still better plug the hole. Use RCU list iteration in all places and ammend the comments. Change lockdep_free_key_range() to issue a sync_sched() between removal from the lists and returning -- which results in the memory being freed. Further ensure the callers are placed correctly and comment the requirements. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Tsyvarev Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 81 ++++++++++++++++++++++++++++++++---------------- kernel/module.c | 8 ++--- 2 files changed, 59 insertions(+), 30 deletions(-) (limited to 'kernel/module.c') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..ba77ab5f64dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) hash_head = classhashentry(key); /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: + * We do an RCU walk of the hash, see lockdep_free_key_range(). */ - list_for_each_entry(class, hash_head, hash_entry) { + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return NULL; + + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; - unsigned long flags; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } /* * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry(class, hash_head, hash_entry) + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; + } + /* * Allocate a new key from the static array, and add it to * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); return NULL; } - raw_local_irq_restore(flags); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); @@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) @@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\n"); dump_stack(); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } } out_unlock_set: graph_unlock(); - raw_local_irq_restore(flags); out_set_class_cache: if (!subclass || force) @@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, entry->distance = distance; entry->trace = *trace; /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * Both allocation and removal are done under the graph lock; but + * iteration is under RCU-sched; see look_up_lock_class() and + * lockdep_free_key_range(). */ list_add_tail_rcu(&entry->entry, head); @@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry, else head = &lock->class->locks_before; - list_for_each_entry(entry, head, entry) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; mark_lock_accessed(entry, lock); @@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry(chain, hash_head, entry) { + list_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (unlikely(!debug_locks)) return; - if (subclass) + if (subclass) { + unsigned long flags; + + if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; register_lock_class(lock, subclass, 1); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + } } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one sync_sched() before getting here, so we're guaranteed + * nobody will look up these exact classes -- they're properly dead but still + * allocated. + */ void lockdep_free_key_range(void *start, unsigned long size) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i; @@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size) if (locked) graph_unlock(); raw_local_irq_restore(flags); + + /* + * Wait for any possible iterators from look_up_lock_class() to pass + * before continuing to free the memory they refer to. + * + * sync_sched() is sufficient because the read-side is IRQ disable. + */ + synchronize_sched(); + + /* + * XXX at this point we could return the resources to the pool; + * instead we leak them. We would need to change to bitmap allocators + * instead of the linear allocators we have now. + */ } void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i, j; @@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..99fdf94efce8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - /* Free lock-classes: */ + /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ @@ -3349,9 +3349,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* Free lock-classes: */ - lockdep_free_key_range(mod->module_core, mod->core_size); - /* we can't deallocate the module until we clear memory protection */ unset_module_init_ro_nx(mod); unset_module_core_ro_nx(mod); @@ -3375,6 +3372,9 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); mutex_unlock(&module_mutex); free_module: + /* Free lock-classes; relies on the preceding sync_rcu() */ + lockdep_free_key_range(mod->module_core, mod->core_size); + module_deallocate(mod, info); free_copy: free_copy(info); -- cgit v1.3-8-gc7d7 From 3673b8e4ce7237160fa31ee8d7e94a4d5a9976a1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 25 Mar 2015 15:44:21 -0400 Subject: tracing: Allow for modules to convert their enums to values Update the infrastructure such that modules that declare TRACE_DEFINE_ENUM() will have those enums converted into their values in the tracepoint print fmt strings. Link: http://lkml.kernel.org/r/87vbhjp74q.fsf@rustcorp.com.au Acked-by: Rusty Russell Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/module.h | 2 ++ kernel/module.c | 3 +++ kernel/trace/trace.c | 49 +++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_events.c | 2 +- 4 files changed, 51 insertions(+), 5 deletions(-) (limited to 'kernel/module.c') diff --git a/include/linux/module.h b/include/linux/module.h index 42999fe2dbd0..53dc41dd5c62 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -338,6 +338,8 @@ struct module { #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call **trace_events; unsigned int num_trace_events; + struct trace_enum_map **trace_enums; + unsigned int num_trace_enums; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD unsigned int num_ftrace_callsites; diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..d8f8ab271c2b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2753,6 +2753,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); + mod->trace_enums = section_objs(info, "_ftrace_enum_map", + sizeof(*mod->trace_enums), + &mod->num_trace_enums); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebf49649534c..28e6654e640d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3908,11 +3908,9 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { .write = tracing_saved_cmdlines_size_write, }; -static void -trace_insert_enum_map(struct trace_enum_map **start, struct trace_enum_map **stop) +static void trace_insert_enum_map(struct trace_enum_map **start, int len) { struct trace_enum_map **map; - int len = stop - start; if (len <= 0) return; @@ -6561,9 +6559,48 @@ extern struct trace_enum_map *__stop_ftrace_enum_maps[]; static void __init trace_enum_init(void) { - trace_insert_enum_map(__start_ftrace_enum_maps, __stop_ftrace_enum_maps); + int len; + + len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; + trace_insert_enum_map(__start_ftrace_enum_maps, len); +} + +#ifdef CONFIG_MODULES +static void trace_module_add_enums(struct module *mod) +{ + if (!mod->num_trace_enums) + return; + + /* + * Modules with bad taint do not have events created, do + * not bother with enums either. + */ + if (trace_module_has_bad_taint(mod)) + return; + + trace_insert_enum_map(mod->trace_enums, mod->num_trace_enums); +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_enums(mod); + break; + } + + return 0; } +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; +#endif + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -6590,6 +6627,10 @@ static __init int tracer_init_debugfs(void) trace_enum_init(); +#ifdef CONFIG_MODULES + register_module_notifier(&trace_module_nb); +#endif + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fc58c50fbf01..a576bbe75577 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2034,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self, static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, - .priority = 0, + .priority = 1, /* higher than trace.c module notify */ }; #endif /* CONFIG_MODULES */ -- cgit v1.3-8-gc7d7 From 3afe9f849600645723246baa95e7559caeca6ce9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Apr 2015 10:33:49 -0700 Subject: Copy the kernel module data from user space in chunks Unlike most (all?) other copies from user space, kernel module loading is almost unlimited in size. So we do a potentially huge "copy_from_user()" when we copy the module data from user space to the kernel buffer, which can be a latency concern when preemption is disabled (or voluntary). Also, because 'copy_from_user()' clears the tail of the kernel buffer on failures, even a *failed* copy can end up wasting a lot of time. Normally neither of these are concerns in real life, but they do trigger when doing stress-testing with trinity. Running in a VM seems to add its own overheadm causing trinity module load testing to even trigger the watchdog. The simple fix is to just chunk up the module loading, so that it never tries to copy insanely big areas in one go. That bounds the latency, and also the amount of (unnecessarily, in this case) cleared memory for the failure case. Reported-by: Sasha Levin Signed-off-by: Linus Torvalds --- kernel/module.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel/module.c') diff --git a/kernel/module.c b/kernel/module.c index 99fdf94efce8..ec53f594e9c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info) return 0; } +#define COPY_CHUNK_SIZE (16*PAGE_SIZE) + +static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) +{ + do { + unsigned long n = min(len, COPY_CHUNK_SIZE); + + if (copy_from_user(dst, usrc, n) != 0) + return -EFAULT; + cond_resched(); + dst += n; + usrc += n; + len -= n; + } while (len); + return 0; +} + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (!info->hdr) return -ENOMEM; - if (copy_from_user(info->hdr, umod, info->len) != 0) { + if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { vfree(info->hdr); return -EFAULT; } -- cgit v1.3-8-gc7d7