diff options
author | 2022-08-02 10:06:12 -0700 | |
---|---|---|
committer | 2022-08-02 10:06:12 -0700 | |
commit | 8bb5e7f4dcd9b9ef22a3ea25c9066a8a968f12dd (patch) | |
tree | 0f1383880607a227142f9388a066959926233ff1 /kernel | |
parent | Input: document the units for resolution of size axes (diff) | |
parent | Input: adc-joystick - fix ordering in adc_joystick_probe() (diff) | |
download | wireguard-linux-8bb5e7f4dcd9b9ef22a3ea25c9066a8a968f12dd.tar.xz wireguard-linux-8bb5e7f4dcd9b9ef22a3ea25c9066a8a968f12dd.zip |
Merge branch 'next' into for-linus
Prepare input updates for 5.20 (or 6.0) merge window.
Diffstat (limited to 'kernel')
283 files changed, 17938 insertions, 8964 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index ce77f0265660..c2f1fd95a821 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -96,8 +96,9 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD - default y + default y if HAVE_PREEMPT_DYNAMIC_CALL help This option allows to define the preemption model on the kernel command line parameter and thus override the default preemption diff --git a/kernel/Makefile b/kernel/Makefile index 56f4ee97f328..a7e1f49ab2b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ - extable.o params.o \ + extable.o params.o platform-feature.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o @@ -29,7 +29,6 @@ KCOV_INSTRUMENT_softirq.o := n KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n KCOV_INSTRUMENT_extable.o := n KCOV_INSTRUMENT_stacktrace.o := n # Don't self-instrument. @@ -53,6 +52,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -66,9 +66,6 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o @@ -108,12 +105,14 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ +obj-$(CONFIG_RETHOOK) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o -obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/acct.c b/kernel/acct.c index 3df53cf1dcd5..13706356ec54 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -70,11 +70,31 @@ * Turned into sysctl-controllable parameters. AV, 12/11/98 */ -int acct_parm[3] = {4, 2, 30}; +static int acct_parm[3] = {4, 2, 30}; #define RESUME (acct_parm[0]) /* >foo% free space - resume */ #define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */ #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_acct_table[] = { + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static __init int kernel_acct_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_acct_table); + return 0; +} +late_initcall(kernel_acct_sysctls_init); +#endif /* CONFIG_SYSCTL */ + /* * External references and all of the globals. */ diff --git a/kernel/audit.h b/kernel/audit.h index c4498090a5bd..58b66543b4d5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -201,6 +201,10 @@ struct audit_context { struct { char *name; } module; + struct { + struct audit_ntp_data ntp_data; + struct timespec64 tk_injoffset; + } time; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 02348b48447c..6432a37ac1c9 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -100,7 +100,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true); + ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); @@ -181,7 +181,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = { static int __init audit_fsnotify_init(void) { - audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops, + FSNOTIFY_GROUP_DUPS); if (IS_ERR(audit_fsnotify_group)) { audit_fsnotify_group = NULL; audit_panic("cannot create audit fsnotify group"); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e7315d487163..e867c17d3f84 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -351,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) struct audit_chunk *new; int size; - mutex_lock(&audit_tree_group->mark_mutex); + fsnotify_group_lock(audit_tree_group); /* * mark_mutex stabilizes chunk attached to the mark so we can check * whether it didn't change while we've dropped hash_lock. @@ -368,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); fsnotify_detach_mark(mark); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); audit_mark_put_chunk(chunk); fsnotify_free_mark(mark); return; @@ -385,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) */ replace_chunk(new, chunk); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); audit_mark_put_chunk(chunk); return; out_mutex: - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); } /* Call with group->mark_mutex held, releases it */ @@ -400,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); return -ENOMEM; } mark = alloc_mark(); if (!mark) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); kfree(chunk); return -ENOMEM; } if (fsnotify_add_inode_mark_locked(mark, inode, 0)) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); kfree(chunk); return -ENOSPC; @@ -422,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); fsnotify_detach_mark(mark); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); kfree(chunk); @@ -444,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) */ insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); /* * Drop our initial reference. When mark we point to is getting freed, * we get notification through ->freeing_mark callback and cleanup @@ -462,7 +462,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct audit_node *p; int n; - mutex_lock(&audit_tree_group->mark_mutex); + fsnotify_group_lock(audit_tree_group); mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); if (!mark) return create_chunk(inode, tree); @@ -478,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); return 0; } @@ -487,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); return -ENOMEM; } @@ -495,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); kfree(chunk); return 0; @@ -515,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) */ replace_chunk(chunk, old); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */ audit_mark_put_chunk(old); @@ -1044,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark, { struct audit_chunk *chunk; - mutex_lock(&mark->group->mark_mutex); + fsnotify_group_lock(mark->group); spin_lock(&hash_lock); chunk = mark_chunk(mark); replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); - mutex_unlock(&mark->group->mark_mutex); + fsnotify_group_unlock(mark->group); if (chunk) { evict_chunk(chunk); audit_mark_put_chunk(chunk); @@ -1074,7 +1074,7 @@ static int __init audit_tree_init(void) audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC); - audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0); if (IS_ERR(audit_tree_group)) audit_panic("cannot initialize fsnotify group for rectree watches"); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 713b256be944..4b0957aa2cd4 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -493,7 +493,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = { static int __init audit_watch_init(void) { - audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0); if (IS_ERR(audit_watch_group)) { audit_watch_group = NULL; audit_panic("cannot create audit fsnotify group"); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a83928cbdcb7..3a8c9d744800 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1014,10 +1014,10 @@ static void audit_reset_context(struct audit_context *ctx) ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); - ctx->type = 0; audit_free_module(ctx); ctx->fds[0] = -1; audit_proctitle_free(ctx); + ctx->type = 0; /* reset last for audit_free_*() */ } static inline struct audit_context *audit_alloc_context(enum audit_state state) @@ -1340,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) from_kuid(&init_user_ns, name->fcap.rootid)); } +static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) +{ + const struct audit_ntp_data *ntp = &context->time.ntp_data; + const struct timespec64 *tk = &context->time.tk_injoffset; + static const char * const ntp_name[] = { + "offset", + "freq", + "status", + "tai", + "tick", + "adjust", + }; + int type; + + if (context->type == AUDIT_TIME_ADJNTPVAL) { + for (type = 0; type < AUDIT_NTP_NVALS; type++) { + if (ntp->vals[type].newval != ntp->vals[type].oldval) { + if (!*ab) { + *ab = audit_log_start(context, + GFP_KERNEL, + AUDIT_TIME_ADJNTPVAL); + if (!*ab) + return; + } + audit_log_format(*ab, "op=%s old=%lli new=%lli", + ntp_name[type], + ntp->vals[type].oldval, + ntp->vals[type].newval); + audit_log_end(*ab); + *ab = NULL; + } + } + } + if (tk->tv_sec != 0 || tk->tv_nsec != 0) { + if (!*ab) { + *ab = audit_log_start(context, GFP_KERNEL, + AUDIT_TIME_INJOFFSET); + if (!*ab) + return; + } + audit_log_format(*ab, "sec=%lli nsec=%li", + (long long)tk->tv_sec, tk->tv_nsec); + audit_log_end(*ab); + *ab = NULL; + } +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1454,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "(null)"); break; + case AUDIT_TIME_ADJNTPVAL: + case AUDIT_TIME_INJOFFSET: + /* this call deviates from the rest, eating the buffer */ + audit_log_time(context, &ab); + break; } audit_log_end(ab); } @@ -1907,6 +1959,12 @@ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); + if (ctx->dummy) { + if (ctx->context != AUDIT_CTX_URING) + return; + goto out; + } + if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case @@ -2849,31 +2907,26 @@ void __audit_fanotify(unsigned int response) void __audit_tk_injoffset(struct timespec64 offset) { - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, - "sec=%lli nsec=%li", - (long long)offset.tv_sec, offset.tv_nsec); -} - -static void audit_log_ntp_val(const struct audit_ntp_data *ad, - const char *op, enum audit_ntp_type type) -{ - const struct audit_ntp_val *val = &ad->vals[type]; - - if (val->newval == val->oldval) - return; + struct audit_context *context = audit_context(); - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, - "op=%s old=%lli new=%lli", op, val->oldval, val->newval); + /* only set type if not already set by NTP */ + if (!context->type) + context->type = AUDIT_TIME_INJOFFSET; + memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { - audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); - audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); - audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); - audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); - audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); - audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); + struct audit_context *context = audit_context(); + int type; + + for (type = 0; type < AUDIT_NTP_NVALS; type++) + if (ad->vals[type].newval != ad->vals[type].oldval) { + /* unconditionally set type, overwriting TK */ + context->type = AUDIT_TIME_ADJNTPVAL; + memcpy(&context->time.ntp_data, ad, sizeof(*ad)); + break; + } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d24d518ddd63..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,9 +27,11 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select PAGE_POOL if NET default n help Enable the bpf() system call that allows to manipulate BPF programs @@ -58,6 +60,10 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter. + When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable + is permanently set to 1 and setting any other value than that will + return failure. + config BPF_JIT_DEFAULT_ON def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON depends on HAVE_EBPF_JIT && BPF_JIT diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c1a9be6a4b9f..057ba8e01e70 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c7a5be3bf8be..fe40d3b9458f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,6 +11,7 @@ #include <linux/perf_event.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> +#include <linux/btf_ids.h> #include "map_in_map.h" @@ -242,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (cpu >= nr_cpu_ids) + return NULL; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); +} + int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -287,10 +302,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_timer_in_array(struct bpf_array *arr, void *val) +static void check_and_free_fields(struct bpf_array *arr, void *val) { - if (unlikely(map_value_has_timer(&arr->map))) + if (map_value_has_timer(&arr->map)) bpf_timer_cancel_and_free(val + arr->map.timer_off); + if (map_value_has_kptrs(&arr->map)) + bpf_map_free_kptrs(&arr->map, val); } /* Called from syscall or from eBPF program */ @@ -327,7 +344,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_timer_in_array(array, val); + check_and_free_fields(array, val); } return 0; } @@ -386,7 +403,8 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (likely(!map_value_has_timer(map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(map)) return; for (i = 0; i < array->map.max_entries; i++) @@ -398,6 +416,13 @@ static void array_map_free_timers(struct bpf_map *map) static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + if (map_value_has_kptrs(map)) { + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_kptrs(map, array->value + array->elem_size * i); + bpf_map_free_kptr_off_tab(map); + } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -680,7 +705,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ return num_elems; } -static int array_map_btf_id; +BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -701,12 +726,10 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -716,14 +739,14 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &percpu_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -837,13 +860,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; - if (!bpf_prog_array_compatible(array, prog)) { + if (!bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } @@ -1071,7 +1093,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); - spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { @@ -1104,7 +1125,6 @@ static void prog_array_map_free(struct bpf_map *map) * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. */ -static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -1120,8 +1140,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &prog_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -1210,7 +1229,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1223,8 +1241,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &perf_event_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS @@ -1247,7 +1264,6 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1259,8 +1275,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &cgroup_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #endif @@ -1334,7 +1349,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1346,7 +1360,8 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &array_of_maps_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b141a1346f72..b9ea539a5561 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -7,6 +7,7 @@ #include <linux/err.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/btf_ids.h> #define BLOOM_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) @@ -192,7 +193,7 @@ static int bloom_map_check_btf(const struct bpf_map *map, return btf_type_is_void(key_type) ? 0 : -EINVAL; } -static int bpf_bloom_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, @@ -205,6 +206,5 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, - .map_btf_name = "bpf_bloom_filter", - .map_btf_id = &bpf_bloom_map_btf_id, + .map_btf_id = &bpf_bloom_map_btf_ids[0], }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e29d9e3d853e..5f7683b19199 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -90,7 +90,7 @@ void bpf_inode_storage_free(struct inode *inode) */ bpf_selem_unlink_map(selem); free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); @@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, - value, map_flags); + value, map_flags, GFP_ATOMIC); fput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -149,7 +149,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) return err; } -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } @@ -244,7 +245,8 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, NULL); } -static int inode_storage_map_btf_id; +BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, + bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -255,8 +257,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &inode_storage_map_btf_id, + .map_btf_id = &inode_storage_map_btf_ids[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b7aef5b3416d..d5d96ceca105 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -5,6 +5,7 @@ #include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/rcupdate_trace.h> struct bpf_iter_target_info { struct list_head list; @@ -329,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; + struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; - struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); - bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { - supported = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id && iter->btf_id == prog_btf_id) { + tinfo = iter; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { - cache_btf_id(tinfo, prog); - supported = true; + if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { + cache_btf_id(iter, prog); + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (supported) { + if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } - return supported; + return tinfo != NULL; } const struct bpf_func_proto * @@ -498,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { + struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; - struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; - bool existed = false; bpfptr_t ulinfo; int err; @@ -529,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id == prog_btf_id) { - existed = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id == prog_btf_id) { + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (!existed) + if (!tinfo) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); @@ -546,7 +545,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; @@ -684,11 +683,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { int ret; - rcu_read_lock(); - migrate_disable(); - ret = bpf_prog_run(prog, ctx); - migrate_enable(); - rcu_read_unlock(); + if (prog->aux->sleepable) { + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock_trace(); + } else { + rcu_read_lock(); + migrate_disable(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + } /* bpf program can only return 0 or 1: * 0 : okay diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 71de2a89869c..8ce40fd869f6 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem) + void *value, bool charge_mem, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -106,7 +106,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) */ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem) + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -136,7 +136,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, * will be done by the caller. * * Although the unlock will be done under - * rcu_read_lock(), it is more intutivie to + * rcu_read_lock(), it is more intuitive to * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * @@ -150,11 +150,16 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + if (use_trace_rcu) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + else + kfree_rcu(selem, rcu); + return free_local_storage; } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, + bool use_trace_rcu) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -169,12 +174,16 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true); + local_storage, selem, true, use_trace_rcu); raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (free_local_storage) - call_rcu_tasks_trace(&local_storage->rcu, + if (free_local_storage) { + if (use_trace_rcu) + call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_rcu); + else + kfree_rcu(local_storage, rcu); + } } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -214,14 +223,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); - __bpf_selem_unlink_storage(selem); + __bpf_selem_unlink_storage(selem, use_trace_rcu); } struct bpf_local_storage_data * @@ -282,7 +291,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem) + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; @@ -293,7 +303,7 @@ int bpf_local_storage_alloc(void *owner, return err; storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -350,10 +360,10 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags) + void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem; + struct bpf_local_storage_elem *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -365,6 +375,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); + if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) + return ERR_PTR(-EINVAL); + local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { @@ -373,11 +386,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true); + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); - err = bpf_local_storage_alloc(owner, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { kfree(selem); mem_uncharge(smap, owner, smap->elem_size); @@ -404,6 +417,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } + if (gfp_flags == GFP_KERNEL) { + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!selem) + return ERR_PTR(-ENOMEM); + } + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ @@ -429,19 +448,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; + if (gfp_flags != GFP_KERNEL) { + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } } /* First, link the new selem to the map */ @@ -454,7 +475,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false); + false, true); } unlock: @@ -463,6 +484,10 @@ unlock: unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + if (selem) { + mem_uncharge(smap, owner, smap->elem_size); + kfree(selem); + } return ERR_PTR(err); } @@ -532,7 +557,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, migrate_disable(); __this_cpu_inc(*busy_counter); } - bpf_selem_unlink(selem); + bpf_selem_unlink(selem, false); if (busy_counter) { __this_cpu_dec(*busy_counter); migrate_enable(); diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 6b12f06ee18c..4ea227c9c1ad 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -4,6 +4,7 @@ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ +#include <linux/cache.h> #include <linux/list.h> #include <linux/spinlock_types.h> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9e4ecc990647..c1351df9f7ee 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -99,6 +99,39 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size) +{ + return ima_file_hash(file, dst, size); +} + +BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) + +static const struct bpf_func_proto bpf_ima_file_hash_proto = { + .func = bpf_ima_file_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + +BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto = { + .func = bpf_get_attach_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -121,6 +154,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + case BPF_FUNC_ima_file_hash: + return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } @@ -167,6 +204,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr) BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) #ifdef CONFIG_KEYS diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 21069dbe9138..d9a3c9207240 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -10,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/mutex.h> +#include <linux/btf_ids.h> enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, @@ -32,15 +33,15 @@ struct bpf_struct_ops_map { const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; - /* progs has all the bpf_prog that is populated + /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ - struct bpf_prog **progs; + struct bpf_link **links; /* image is a page that has all the trampolines * that stores the func args before calling the bpf_prog. * A PAGE_SIZE "image" is enough to store all trampoline for - * "progs[]". + * "links[]". */ void *image; /* uvalue->data stores the kernel struct @@ -263,7 +264,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, /* No lock is needed. state and refcnt do not need * to be updated together under atomic context. */ - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->state = state; refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); @@ -282,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) u32 i; for (i = 0; i < btf_type_vlen(t); i++) { - if (st_map->progs[i]) { - bpf_prog_put(st_map->progs[i]); - st_map->progs[i] = NULL; + if (st_map->links[i]) { + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } } @@ -315,18 +316,34 @@ static int check_zero_holes(const struct btf_type *t, void *data) return 0; } -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +static void bpf_struct_ops_link_release(struct bpf_link *link) +{ +} + +static void bpf_struct_ops_link_dealloc(struct bpf_link *link) +{ + struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); + + kfree(tlink); +} + +const struct bpf_link_ops bpf_struct_ops_link_lops = { + .release = bpf_struct_ops_link_release, + .dealloc = bpf_struct_ops_link_dealloc, +}; + +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end) { u32 flags; - tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; - tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + tlinks[BPF_TRAMP_FENTRY].links[0] = link; + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tprogs, NULL); + model, flags, tlinks, NULL); } static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -337,7 +354,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_progs *tprogs = NULL; + struct bpf_tramp_links *tlinks = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image, *image_end; @@ -353,7 +370,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) return err; - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; err = check_zero_holes(t, uvalue->data); if (err) return err; @@ -361,8 +378,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -385,6 +402,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; + struct bpf_tramp_link *link; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; @@ -438,16 +456,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = PTR_ERR(prog); goto reset_unlock; } - st_map->progs[i] = prog; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops->type_id || prog->expected_attach_type != i) { + bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + bpf_prog_put(prog); + err = -ENOMEM; + goto reset_unlock; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog); + st_map->links[i] = &link->link; + + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], image, image_end); if (err < 0) @@ -490,7 +518,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tprogs); + kfree(tlinks); mutex_unlock(&st_map->lock); return err; } @@ -545,9 +573,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - if (st_map->progs) + if (st_map->links) bpf_struct_ops_map_put_progs(st_map); - bpf_map_area_free(st_map->progs); + bpf_map_area_free(st_map->links); bpf_jit_free_exec(st_map->image); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -596,11 +624,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->progs = - bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + st_map->links = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->progs || !st_map->image) { + if (!st_map->uvalue || !st_map->links || !st_map->image) { bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -612,7 +640,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } -static int bpf_struct_ops_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -622,8 +650,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, - .map_btf_name = "bpf_struct_ops_map", - .map_btf_id = &bpf_struct_ops_map_btf_id, + .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 5da7bed0f5f6..e9014dc62682 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -102,7 +102,7 @@ void bpf_task_storage_free(struct task_struct *task) */ bpf_selem_unlink_map(selem); free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( - task, (struct bpf_local_storage_map *)map, value, map_flags); + task, (struct bpf_local_storage_map *)map, value, map_flags, + GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -191,7 +192,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -226,8 +227,9 @@ out: return err; } -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); unlock: bpf_task_storage_unlock(); @@ -305,7 +307,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } -static int task_storage_map_btf_id; +BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -316,8 +318,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &task_storage_map_btf_id, + .map_btf_id = &task_storage_map_btf_ids[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3e23b3fa79ff..eb12d4f705cc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include <uapi/linux/btf.h> @@ -198,6 +198,29 @@ DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); +enum btf_kfunc_hook { + BTF_KFUNC_HOOK_XDP, + BTF_KFUNC_HOOK_TC, + BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_TRACING, + BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_MAX, +}; + +enum { + BTF_KFUNC_SET_MAX_CNT = 32, + BTF_DTOR_KFUNC_MAX_CNT = 256, +}; + +struct btf_kfunc_set_tab { + struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; +}; + +struct btf_id_dtor_kfunc_tab { + u32 cnt; + struct btf_id_dtor_kfunc dtors[]; +}; + struct btf { void *data; struct btf_type **types; @@ -212,6 +235,8 @@ struct btf { refcount_t refcnt; u32 id; struct rcu_head rcu; + struct btf_kfunc_set_tab *kfunc_set_tab; + struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; /* split BTF support */ struct btf *base_btf; @@ -403,6 +428,9 @@ static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -506,6 +534,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } +static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +{ + struct btf *btf; + s32 ret; + int id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + ret = btf_find_by_name_kind(btf, name, kind); + /* ret is never zero, since btf_find_by_name_kind returns + * positive btf_id or negative error. + */ + if (ret > 0) { + btf_get(btf); + *btf_p = btf; + return ret; + } + + /* If name is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, btf, id) { + if (!btf_is_module(btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(btf); + spin_unlock_bh(&btf_idr_lock); + ret = btf_find_by_name_kind(btf, name, kind); + if (ret > 0) { + *btf_p = btf; + return ret; + } + spin_lock_bh(&btf_idr_lock); + btf_put(btf); + } + spin_unlock_bh(&btf_idr_lock); + return ret; +} + const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { @@ -579,6 +651,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t) btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || + btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } @@ -1531,8 +1604,41 @@ static void btf_free_id(struct btf *btf) spin_unlock_irqrestore(&btf_idr_lock, flags); } +static void btf_free_kfunc_set_tab(struct btf *btf) +{ + struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; + int hook, type; + + if (!tab) + return; + /* For module BTF, we directly assign the sets being registered, so + * there is nothing to free except kfunc_set_tab. + */ + if (btf_is_module(btf)) + goto free_tab; + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { + for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) + kfree(tab->sets[hook][type]); + } +free_tab: + kfree(tab); + btf->kfunc_set_tab = NULL; +} + +static void btf_free_dtor_kfunc_tab(struct btf *btf) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + + if (!tab) + return; + kfree(tab); + btf->dtor_kfunc_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_dtor_kfunc_tab(btf); + btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); @@ -2505,7 +2611,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr will not referring back to - * the currenct ptr (t). + * the current ptr (t). */ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; @@ -3077,24 +3183,86 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +enum btf_field_type { + BTF_FIELD_SPIN_LOCK, + BTF_FIELD_TIMER, + BTF_FIELD_KPTR, +}; + +enum { + BTF_FIELD_IGNORE = 0, + BTF_FIELD_FOUND = 1, +}; + +struct btf_field_info { + u32 type_id; + u32 off; + enum bpf_kptr_type type; +}; + +static int btf_find_struct(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + if (!__btf_type_is_struct(t)) + return BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + info->off = off; + return BTF_FIELD_FOUND; +} + +static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + enum bpf_kptr_type type; + u32 res_id; + + /* For PTR, sz is always == 8 */ + if (!btf_type_is_ptr(t)) + return BTF_FIELD_IGNORE; + t = btf_type_by_id(btf, t->type); + + if (!btf_type_is_type_tag(t)) + return BTF_FIELD_IGNORE; + /* Reject extra tags */ + if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) + return -EINVAL; + if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_UNREF; + else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_REF; + else + return -EINVAL; + + /* Get the base type */ + t = btf_type_skip_modifiers(btf, t->type, &res_id); + /* Only pointer to struct is allowed */ + if (!__btf_type_is_struct(t)) + return -EINVAL; + + info->type_id = res_id; + info->off = off; + info->type = type; + return BTF_FIELD_FOUND; +} + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { const struct btf_member *member; - u32 i, off = -ENOENT; + struct btf_field_info tmp; + int ret, idx = 0; + u32 i, off; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (!__btf_type_is_struct(member_type)) - continue; - if (member_type->size != sz) - continue; - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + + if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ @@ -3102,46 +3270,115 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t off /= 8; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return off; + return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { const struct btf_var_secinfo *vsi; - u32 i, off = -ENOENT; + struct btf_field_info tmp; + int ret, idx = 0; + u32 i, off; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - if (!__btf_type_is_struct(var_type)) - continue; - if (var_type->size != sz) + off = vsi->offset; + + if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) continue; if (vsi->size != sz) continue; - if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) - continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; - off = vsi->offset; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return off; + return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { + const char *name; + int sz, align; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + name = "bpf_spin_lock"; + sz = sizeof(struct bpf_spin_lock); + align = __alignof__(struct bpf_spin_lock); + break; + case BTF_FIELD_TIMER: + name = "bpf_timer"; + sz = sizeof(struct bpf_timer); + align = __alignof__(struct bpf_timer); + break; + case BTF_FIELD_KPTR: + name = NULL; + sz = sizeof(u64); + align = 8; + break; + default: + return -EFAULT; + } if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align); + return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align); + return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); return -EINVAL; } @@ -3151,16 +3388,130 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, */ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_spin_lock", - sizeof(struct bpf_spin_lock), - __alignof__(struct bpf_spin_lock)); + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; } int btf_find_timer(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_timer", - sizeof(struct bpf_timer), - __alignof__(struct bpf_timer)); + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; +} + +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t) +{ + struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; + struct bpf_map_value_off *tab; + struct btf *kernel_btf = NULL; + struct module *mod = NULL; + int ret, i, nr_off; + + ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + if (ret < 0) + return ERR_PTR(ret); + if (!ret) + return NULL; + + nr_off = ret; + tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_off; i++) { + const struct btf_type *t; + s32 id; + + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info_arr[i].type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) { + ret = id; + goto end; + } + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info_arr[i].type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } + + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } + + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + tab->off[i].kptr.dtor = (void *)addr; + } + + tab->off[i].offset = info_arr[i].off; + tab->off[i].type = info_arr[i].type; + tab->off[i].kptr.btf_id = id; + tab->off[i].kptr.btf = kernel_btf; + tab->off[i].kptr.module = mod; + } + tab->nr_off = nr_off; + return tab; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); +end: + while (i--) { + btf_put(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + } + kfree(tab); + return ERR_PTR(ret); } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, @@ -3533,9 +3884,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_func_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + int err; + + err = btf_func_check(env, t); + if (err) + return err; + + env_stack_pop_resolved(env, next_type_id, 0); + return 0; +} + static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, - .resolve = btf_df_resolve, + .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, @@ -4156,7 +4522,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); - if (btf_type_is_decl_tag(t)) + if (btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); @@ -4246,12 +4612,6 @@ static int btf_check_all_types(struct btf_verifier_env *env) if (err) return err; } - - if (btf_type_is_func(t)) { - err = btf_func_check(env, t); - if (err) - return err; - } } return 0; @@ -4387,8 +4747,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) btf = env->btf; btf_data_size = btf->data_size; - if (btf_data_size < - offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { + if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } @@ -4447,6 +4806,53 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return 0; } +static int btf_check_type_tags(struct btf_verifier_env *env, + struct btf *btf, int start_id) +{ + int i, n, good_id = start_id - 1; + bool in_tags; + + n = btf_nr_types(btf); + for (i = start_id; i < n; i++) { + const struct btf_type *t; + int chain_limit = 32; + u32 cur_id = i; + + t = btf_type_by_id(btf, i); + if (!t) + return -EINVAL; + if (!btf_type_is_modifier(t)) + continue; + + cond_resched(); + + in_tags = btf_type_is_type_tag(t); + while (btf_type_is_modifier(t)) { + if (!chain_limit--) { + btf_verifier_log(env, "Max chain length or cycle detected"); + return -ELOOP; + } + if (btf_type_is_type_tag(t)) { + if (!in_tags) { + btf_verifier_log(env, "Type tags don't precede modifiers"); + return -EINVAL; + } + } else if (in_tags) { + in_tags = false; + } + if (cur_id <= good_id) + break; + /* Move to next type */ + cur_id = t->type; + t = btf_type_by_id(btf, cur_id); + if (!t) + return -EINVAL; + } + good_id = i; + } + return 0; +} + static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { @@ -4514,6 +4920,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; @@ -4622,41 +5032,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, return ctx_type; } -static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { -#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) -#define BPF_LINK_TYPE(_id, _name) -#define BPF_MAP_TYPE(_id, _ops) \ - [_id] = &_ops, -#include <linux/bpf_types.h> -#undef BPF_PROG_TYPE -#undef BPF_LINK_TYPE -#undef BPF_MAP_TYPE -}; - -static int btf_vmlinux_map_ids_init(const struct btf *btf, - struct bpf_verifier_log *log) -{ - const struct bpf_map_ops *ops; - int i, btf_id; - - for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { - ops = btf_vmlinux_map_ops[i]; - if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) - continue; - if (!ops->map_btf_name || !ops->map_btf_id) { - bpf_log(log, "map type %d is misconfigured\n", i); - return -EINVAL; - } - btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, - BTF_KIND_STRUCT); - if (btf_id < 0) - return btf_id; - *ops->map_btf_id = btf_id; - } - - return 0; -} - static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -4715,14 +5090,13 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - /* find bpf map structs for map_ptr access checking */ - err = btf_vmlinux_map_ids_init(btf, log); - if (err < 0) - goto errout; - bpf_struct_ops_init(btf, log); refcount_set(&btf->refcnt, 1); @@ -4800,6 +5174,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u if (err) goto errout; + err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); + if (err) + goto errout; + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -4848,6 +5226,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + const char *tag_value; u32 nr_args, arg; int i, ret; @@ -5000,6 +5379,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + info->reg_type |= MEM_USER; + if (strcmp(tag_value, "percpu") == 0) + info->reg_type |= MEM_PERCPU; + } + /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; @@ -5026,12 +5414,12 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const char *tname, *mname; + const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: @@ -5215,7 +5603,8 @@ error: } if (btf_type_is_ptr(mtype)) { - const struct btf_type *stype; + const struct btf_type *stype, *t; + enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { @@ -5224,9 +5613,23 @@ error: mname, moff, tname, off, size); return -EACCES; } + + /* check type tag */ + t = btf_type_by_id(btf, mtype->type); + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + /* check __user tag */ + if (strcmp(tag_value, "user") == 0) + tmp_flag = MEM_USER; + /* check __percpu tag */ + if (strcmp(tag_value, "percpu") == 0) + tmp_flag = MEM_PERCPU; + } + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; + *flag = tmp_flag; return WALK_PTR; } } @@ -5253,13 +5656,14 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { + enum bpf_type_flag tmp_flag = 0; int err; u32 id; do { - err = btf_struct_walk(log, btf, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: @@ -5267,6 +5671,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * we're done. */ *next_btf_id = id; + *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_SCALAR: return SCALAR_VALUE; @@ -5308,20 +5713,27 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1, bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id) + const struct btf *need_btf, u32 need_type_id, + bool strict) { const struct btf_type *type; + enum bpf_type_flag flag; int err; /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; - + /* In case of strict type match, we do not walk struct, the top level + * type match must succeed. When strict is true, off should have already + * been 0. + */ + if (strict) + return false; again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); if (err != WALK_STRUCT) return false; @@ -5385,7 +5797,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); - if (nargs >= MAX_BPF_FUNC_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. Too many.\n", tname, nargs); @@ -5616,17 +6028,46 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, return true; } +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + int len, sfx_len = sizeof("__sz") - 1; + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < sfx_len) + return false; + param_name += len - sfx_len; + if (strncmp(param_name, "__sz", sfx_len)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_verifier_log *log = &env->log; + u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); + bool rel = false, kptr_get = false; const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs, ref_id; + int ref_regno = 0, ret; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5652,10 +6093,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (is_kfunc) { + /* Only kfunc can be release func */ + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); + kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id); + } + /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; @@ -5675,8 +6125,59 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { + + if (rel && reg->ref_obj_id) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); + if (ret < 0) + return ret; + + /* kptr_get is only true for kfunc */ + if (i == 0 && kptr_get) { + struct bpf_map_value_off_desc *off_desc; + + if (reg->type != PTR_TO_MAP_VALUE) { + bpf_log(log, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. + */ + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); + if (!off_desc || off_desc->type != BPF_KPTR_REF) { + bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_type_is_ptr(ref_t)) { + bpf_log(log, "arg#0 BTF type must be a double pointer\n"); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, + off_desc->kptr.btf_id, true)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + /* rest of the arguments can be anything, like normal kfunc */ + } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5686,8 +6187,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - if (check_ptr_off_reg(env, reg, regno)) - return -EINVAL; } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; @@ -5705,6 +6204,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (reg->ref_obj_id) { + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[base_type(reg->type)]; @@ -5715,7 +6224,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id)) { + reg->off, btf, ref_id, rel && reg->ref_obj_id)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, @@ -5728,17 +6237,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, u32 type_size; if (is_kfunc) { + bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. + * When arg_mem_size is true, the pointer can be + * void *. */ if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) { + !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { bpf_log(log, - "arg#%d pointer type %s %s must point to scalar or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname); + "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } + + /* Check for mem, len pair */ + if (arg_mem_size) { + if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { + bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", + i, i + 1); + return -EINVAL; + } + i++; + continue; + } } resolve_ret = btf_resolve_size(btf, ref_t, &type_size); @@ -5759,7 +6284,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } } - return 0; + /* Either both are set, or neither */ + WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); + /* We already made sure ref_obj_id is set only for one argument. We do + * allow (!rel && ref_obj_id), so that passing such referenced + * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when + * is_kfunc is true. + */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + /* returns argument register number > 0 in case of reference release kfunc */ + return rel ? ref_regno : 0; } /* Compare BTF of a function with given bpf_reg_state. @@ -6005,7 +6543,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); - /* If we encontered an error, return it. */ + /* If we encountered an error, return it. */ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; @@ -6201,12 +6739,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +enum { + BTF_MODULE_F_LIVE = (1 << 0), +}; + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; + int flags; }; static LIST_HEAD(btf_modules); @@ -6234,7 +6777,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, int err = 0; if (mod->btf_data_size == 0 || - (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && + op != MODULE_STATE_GOING)) goto out; switch (op) { @@ -6249,7 +6793,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); kfree(btf_mod); - err = PTR_ERR(btf); + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + err = PTR_ERR(btf); goto out; } err = btf_alloc_id(btf); @@ -6293,6 +6838,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, } break; + case MODULE_STATE_LIVE: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_mod->flags |= BTF_MODULE_F_LIVE; + break; + } + mutex_unlock(&btf_module_mutex); + break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { @@ -6339,7 +6895,12 @@ struct module *btf_try_get_module(const struct btf *btf) if (btf_mod->btf != btf) continue; - if (try_module_get(btf_mod->module)) + /* We must only consider module whose __init routine has + * finished, hence we must check for BTF_MODULE_F_LIVE flag, + * which is set from the notifier callback for + * MODULE_STATE_LIVE. + */ + if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; @@ -6350,9 +6911,43 @@ struct module *btf_try_get_module(const struct btf *btf) return res; } +/* Returns struct btf corresponding to the struct module. + * This function can return NULL or ERR_PTR. + */ +static struct btf *btf_get_module_btf(const struct module *module) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + struct btf *btf = NULL; + + if (!module) { + btf = bpf_get_btf_vmlinux(); + if (!IS_ERR_OR_NULL(btf)) + btf_get(btf); + return btf; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_get(btf_mod->btf); + btf = btf_mod->btf; + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return btf; +} + BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { - struct btf *btf; + struct btf *btf = NULL; + int btf_obj_fd = 0; long ret; if (flags) @@ -6361,44 +6956,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - - ret = btf_find_by_name_kind(btf, name, kind); - /* ret is never zero, since btf_find_by_name_kind returns - * positive btf_id or negative error. - */ - if (ret < 0) { - struct btf *mod_btf; - int id; - - /* If name is not found in vmlinux's BTF then search in module's BTFs */ - spin_lock_bh(&btf_idr_lock); - idr_for_each_entry(&btf_idr, mod_btf, id) { - if (!btf_is_module(mod_btf)) - continue; - /* linear search could be slow hence unlock/lock - * the IDR to avoiding holding it for too long - */ - btf_get(mod_btf); - spin_unlock_bh(&btf_idr_lock); - ret = btf_find_by_name_kind(mod_btf, name, kind); - if (ret > 0) { - int btf_obj_fd; - - btf_obj_fd = __btf_new_fd(mod_btf); - if (btf_obj_fd < 0) { - btf_put(mod_btf); - return btf_obj_fd; - } - return ret | (((u64)btf_obj_fd) << 32); - } - spin_lock_bh(&btf_idr_lock); - btf_put(mod_btf); + ret = bpf_find_btf_id(name, kind, &btf); + if (ret > 0 && btf_is_module(btf)) { + btf_obj_fd = __btf_new_fd(btf); + if (btf_obj_fd < 0) { + btf_put(btf); + return btf_obj_fd; } - spin_unlock_bh(&btf_idr_lock); + return ret | (((u64)btf_obj_fd) << 32); } + if (ret > 0) + btf_put(btf); return ret; } @@ -6417,58 +6985,434 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE -/* BTF ID set registration API for modules */ +/* Kernel Function (kfunc) BTF ID set registration API */ -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + struct btf_id_set *add_set, bool vmlinux_set) +{ + struct btf_kfunc_set_tab *tab; + struct btf_id_set *set; + u32 set_cnt; + int ret; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + ret = -EINVAL; + goto end; + } + + if (!add_set->cnt) + return 0; + + tab = btf->kfunc_set_tab; + if (!tab) { + tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return -ENOMEM; + btf->kfunc_set_tab = tab; + } + + set = tab->sets[hook][type]; + /* Warn when register_btf_kfunc_id_set is called twice for the same hook + * for module sets. + */ + if (WARN_ON_ONCE(set && !vmlinux_set)) { + ret = -EINVAL; + goto end; + } + + /* We don't need to allocate, concatenate, and sort module sets, because + * only one is allowed per hook. Hence, we can directly assign the + * pointer and return. + */ + if (!vmlinux_set) { + tab->sets[hook][type] = add_set; + return 0; + } + + /* In case of vmlinux sets, there may be more than one set being + * registered per hook. To create a unified set, we allocate a new set + * and concatenate all individual sets being registered. While each set + * is individually sorted, they may become unsorted when concatenated, + * hence re-sorting the final set again is required to make binary + * searching the set using btf_id_set_contains function work. + */ + set_cnt = set ? set->cnt : 0; + + if (set_cnt > U32_MAX - add_set->cnt) { + ret = -EOVERFLOW; + goto end; + } + + if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { + ret = -E2BIG; + goto end; + } + + /* Grow set */ + set = krealloc(tab->sets[hook][type], + offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!set) { + ret = -ENOMEM; + goto end; + } + + /* For newly allocated set, initialize set->cnt to 0 */ + if (!tab->sets[hook][type]) + set->cnt = 0; + tab->sets[hook][type] = set; -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) + /* Concatenate the two sets */ + memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + set->cnt += add_set->cnt; + + sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_kfunc_set_tab(btf); + return ret; +} + +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) +{ + bool vmlinux_set = !btf_is_module(btf); + int type, ret = 0; + + for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { + if (!kset->sets[type]) + continue; + + ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); + if (ret) + break; + } + return ret; +} + +static bool __btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + u32 kfunc_btf_id) { - mutex_lock(&l->mutex); - list_add(&s->list, &l->list); - mutex_unlock(&l->mutex); + struct btf_id_set *set; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + set = btf->kfunc_set_tab->sets[hook][type]; + if (!set) + return false; + return btf_id_set_contains(set, kfunc_btf_id); +} + +static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_XDP: + return BTF_KFUNC_HOOK_XDP; + case BPF_PROG_TYPE_SCHED_CLS: + return BTF_KFUNC_HOOK_TC; + case BPF_PROG_TYPE_STRUCT_OPS: + return BTF_KFUNC_HOOK_STRUCT_OPS; + case BPF_PROG_TYPE_TRACING: + return BTF_KFUNC_HOOK_TRACING; + case BPF_PROG_TYPE_SYSCALL: + return BTF_KFUNC_HOOK_SYSCALL; + default: + return BTF_KFUNC_HOOK_MAX; + } } -EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set); -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) +/* Caution: + * Reference to the module (obtained using btf_try_get_module) corresponding to + * the struct btf *MUST* be held when calling this function from verifier + * context. This is usually true as we stash references in prog's kfunc_btf_tab; + * keeping the reference for the duration of the call provides the necessary + * protection for looking up a well-formed btf->kfunc_set_tab. + */ +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id) { - mutex_lock(&l->mutex); - list_del_init(&s->list); - mutex_unlock(&l->mutex); + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); } -EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set); -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner) +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) { - struct kfunc_btf_id_set *s; + enum btf_kfunc_hook hook; + struct btf *btf; + int ret; - mutex_lock(&klist->mutex); - list_for_each_entry(s, &klist->list, list) { - if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { - mutex_unlock(&klist->mutex); - return true; + btf = btf_get_module_btf(kset->owner); + if (!btf) { + if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register kfuncs\n"); + return -ENOENT; } + if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register kfuncs\n"); + return -ENOENT; + } + return 0; } - mutex_unlock(&klist->mutex); - return false; + if (IS_ERR(btf)) + return PTR_ERR(btf); + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + ret = btf_populate_kfunc_set(btf, hook, kset); + btf_put(btf); + return ret; } +EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); -#define DEFINE_KFUNC_BTF_ID_LIST(name) \ - struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ - __MUTEX_INITIALIZER(name.mutex) }; \ - EXPORT_SYMBOL_GPL(name) +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + struct btf_id_dtor_kfunc *dtor; -DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); -DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + if (!tab) + return -ENOENT; + /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need + * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. + */ + BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); + dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); + if (!dtor) + return -ENOENT; + return dtor->kfunc_btf_id; +} -#endif +static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) +{ + const struct btf_type *dtor_func, *dtor_func_proto, *t; + const struct btf_param *args; + s32 dtor_btf_id; + u32 nr_args, i; + + for (i = 0; i < cnt; i++) { + dtor_btf_id = dtors[i].kfunc_btf_id; + + dtor_func = btf_type_by_id(btf, dtor_btf_id); + if (!dtor_func || !btf_type_is_func(dtor_func)) + return -EINVAL; + + dtor_func_proto = btf_type_by_id(btf, dtor_func->type); + if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) + return -EINVAL; + + /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ + t = btf_type_by_id(btf, dtor_func_proto->type); + if (!t || !btf_type_is_void(t)) + return -EINVAL; + + nr_args = btf_type_vlen(dtor_func_proto); + if (nr_args != 1) + return -EINVAL; + args = btf_params(dtor_func_proto); + t = btf_type_by_id(btf, args[0].type); + /* Allow any pointer type, as width on targets Linux supports + * will be same for all pointer types (i.e. sizeof(void *)) + */ + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + } + return 0; +} + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner) +{ + struct btf_id_dtor_kfunc_tab *tab; + struct btf *btf; + u32 tab_cnt; + int ret; + btf = btf_get_module_btf(owner); + if (!btf) { + if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + return 0; + } + if (IS_ERR(btf)) + return PTR_ERR(btf); + + if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + /* Ensure that the prototype of dtor kfuncs being registered is sane */ + ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt); + if (ret < 0) + goto end; + + tab = btf->dtor_kfunc_tab; + /* Only one call allowed for modules */ + if (WARN_ON_ONCE(tab && btf_is_module(btf))) { + ret = -EINVAL; + goto end; + } + + tab_cnt = tab ? tab->cnt : 0; + if (tab_cnt > U32_MAX - add_cnt) { + ret = -EOVERFLOW; + goto end; + } + if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + tab = krealloc(btf->dtor_kfunc_tab, + offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!tab) { + ret = -ENOMEM; + goto end; + } + + if (!btf->dtor_kfunc_tab) + tab->cnt = 0; + btf->dtor_kfunc_tab = tab; + + memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); + tab->cnt += add_cnt; + + sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_dtor_kfunc_tab(btf); + btf_put(btf); + return ret; +} +EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); + +#define MAX_TYPES_ARE_COMPAT_DEPTH 2 + +static +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, + int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); + targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + btf_type_skip_modifiers(local_btf, local_p->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + btf_type_skip_modifiers(local_btf, local_type->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + return 0; + } +} + +/* Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return -EOPNOTSUPP; + return __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + MAX_TYPES_ARE_COMPAT_DEPTH); } static bool bpf_core_is_flavor_sep(const char *s) @@ -6711,6 +7655,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) main_btf = bpf_get_btf_vmlinux(); if (IS_ERR(main_btf)) return ERR_CAST(main_btf); + if (!main_btf) + return ERR_PTR(-EINVAL); local_type = btf_type_by_id(local_btf, local_type_id); if (!local_type) @@ -6789,6 +7735,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; + struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; int err; @@ -6828,13 +7775,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, cands.len = cc->cnt; /* cand_cache_mutex needs to span the cache lookup and * copy of btf pointer into bpf_core_cand_list, - * since module can be unloaded while bpf_core_apply_relo_insn + * since module can be unloaded while bpf_core_calc_relo_insn * is working with module's btf. */ } - err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, - relo, relo_idx, ctx->btf, &cands, specs); + err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs, + &targ_res); + if (err) + goto out; + + err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx, + &targ_res); + out: kfree(specs); if (need_cands) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 514b4681a90a..afb414b26d01 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,6 +22,45 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* __always_inline is necessary to prevent indirect call through run_prog + * function pointer. + */ +static __always_inline int +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval, u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 func_ret; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + if (ret_flags) { + *(ret_flags) |= (func_ret >> 1); + func_ret &= 1; + } + if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -1031,7 +1070,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1044,7 +1083,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr - * -EPERM - drop packet + * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. @@ -1075,12 +1114,40 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); if (atype == CGROUP_INET_EGRESS) { - ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); + u32 flags = 0; + bool cn; + + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, + __bpf_prog_run_save_cb, 0, &flags); + + /* Return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * The returned value is then converted to one of the NET_XMIT + * or an error code that is then interpreted as drop packet + * (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -err skb should be dropped + */ + + cn = flags & BPF_RET_SET_CN; + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; + if (!ret) + ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); + else + ret = (cn ? NET_XMIT_DROP : ret); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb); - ret = (ret == 1 ? 0 : -EPERM); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0, + NULL); + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1093,7 +1160,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1107,10 +1174,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, + NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1119,7 +1185,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be exectuted + * @type: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). @@ -1142,7 +1208,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; - int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). @@ -1156,10 +1221,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); - - return ret == 1 ? 0 : -EPERM; + return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, + 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1169,7 +1232,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1184,11 +1247,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, + 0, NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1201,17 +1262,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow; + int ret; rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); - return !allow; + return ret; } +BPF_CALL_0(bpf_get_retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + return ctx->retval; +} + +static const struct bpf_func_proto bpf_get_retval_proto = { + .func = bpf_get_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_set_retval, int, retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + ctx->retval = retval; + return 0; +} + +static const struct bpf_func_proto bpf_set_retval_proto = { + .func = bpf_set_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1224,6 +1315,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } @@ -1337,7 +1432,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1350,24 +1446,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.new_val); } - return ret == 1 ? 0 : -EPERM; + return ret; } #ifdef CONFIG_NET -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum cgroup_bpf_attach_type attach_type) -{ - struct bpf_prog_array *prog_array; - bool empty; - - rcu_read_lock(); - prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); - empty = bpf_prog_array_is_empty(prog_array); - rcu_read_unlock(); - - return empty; -} - static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, struct bpf_sockopt_buf *buf) { @@ -1426,19 +1508,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, }; int ret, max_optlen; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) - return 0; - /* Allocate a bit more than the initial user buffer for * BPF program. The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1451,14 +1525,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, + &ctx, bpf_prog_run, 0, NULL); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ @@ -1518,19 +1590,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, + .current_task = current, }; int ret; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) - return retval; - ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1561,28 +1625,18 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, + &ctx, bpf_prog_run, retval, NULL); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret < 0) goto out; - } if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; goto out; } - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) { - ret = -EFAULT; - goto out; - } - if (ctx.optlen != 0) { if (copy_to_user(optval, ctx.optval, ctx.optlen) || put_user(ctx.optlen, optlen)) { @@ -1591,8 +1645,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } } - ret = ctx.retval; - out: sockopt_free_buf(&ctx, &buf); return ret; @@ -1607,10 +1659,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, .optlen = *optlen, .optval = optval, .optval_end = optval + *optlen, + .current_task = current, }; int ret; @@ -1622,26 +1674,20 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); - if (!ret) - return -EPERM; + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, + &ctx, bpf_prog_run, retval, NULL); + if (ret < 0) + return ret; if (ctx.optlen > *optlen) return -EFAULT; - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) - return -EFAULT; - /* BPF programs can shrink the buffer, export the modifications. */ if (ctx.optlen != 0) *optlen = ctx.optlen; - return ctx.retval; + return ret; } #endif @@ -2057,10 +2103,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); break; case offsetof(struct bpf_sockopt, retval): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); + + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + treg, treg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + si->dst_reg, si->dst_reg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + si->dst_reg, si->dst_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + } break; case offsetof(struct bpf_sockopt, optval): *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index de3e5bc6781f..5f6f3f829b36 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -33,6 +33,7 @@ #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> +#include <linux/nodemask.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); + fp->blinding_requested = bpf_jit_blinding_enabled(fp); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -537,13 +539,10 @@ long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); - unsigned long addr = (unsigned long)hdr; - WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void @@ -808,6 +807,181 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return slot; } +/* + * BPF program pack allocator. + * + * Most BPF programs are pretty small. Allocating a hole page for each + * program is sometime a waste. Many small bpf program also adds pressure + * to instruction TLB. To solve this issue, we introduce a BPF program pack + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) + * to host BPF programs. + */ +#define BPF_PROG_CHUNK_SHIFT 6 +#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) +#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) + +struct bpf_prog_pack { + struct list_head list; + void *ptr; + unsigned long bitmap[]; +}; + +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) + +static size_t bpf_prog_pack_size = -1; +static size_t bpf_prog_pack_mask = -1; + +static int bpf_prog_chunk_count(void) +{ + WARN_ON_ONCE(bpf_prog_pack_size == -1); + return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; +} + +static DEFINE_MUTEX(pack_mutex); +static LIST_HEAD(pack_list); + +/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with + * CONFIG_MMU=n. Use PAGE_SIZE in these cases. + */ +#ifdef PMD_SIZE +#define BPF_HPAGE_SIZE PMD_SIZE +#define BPF_HPAGE_MASK PMD_MASK +#else +#define BPF_HPAGE_SIZE PAGE_SIZE +#define BPF_HPAGE_MASK PAGE_MASK +#endif + +static size_t select_bpf_prog_pack_size(void) +{ + size_t size; + void *ptr; + + size = BPF_HPAGE_SIZE * num_online_nodes(); + ptr = module_alloc(size); + + /* Test whether we can get huge pages. If not just use PAGE_SIZE + * packs. + */ + if (!ptr || !is_vm_area_hugepages(ptr)) { + size = PAGE_SIZE; + bpf_prog_pack_mask = PAGE_MASK; + } else { + bpf_prog_pack_mask = BPF_HPAGE_MASK; + } + + vfree(ptr); + return size; +} + +static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_prog_pack *pack; + + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + GFP_KERNEL); + if (!pack) + return NULL; + pack->ptr = module_alloc(bpf_prog_pack_size); + if (!pack->ptr) { + kfree(pack); + return NULL; + } + bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size); + bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); + list_add_tail(&pack->list, &pack_list); + + set_vm_flush_reset_perms(pack->ptr); + set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + return pack; +} + +static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); + struct bpf_prog_pack *pack; + unsigned long pos; + void *ptr = NULL; + + mutex_lock(&pack_mutex); + if (bpf_prog_pack_size == -1) + bpf_prog_pack_size = select_bpf_prog_pack_size(); + + if (size > bpf_prog_pack_size) { + size = round_up(size, PAGE_SIZE); + ptr = module_alloc(size); + if (ptr) { + bpf_fill_ill_insns(ptr, size); + set_vm_flush_reset_perms(ptr); + set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); + set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + } + goto out; + } + list_for_each_entry(pack, &pack_list, list) { + pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + nbits, 0); + if (pos < bpf_prog_chunk_count()) + goto found_free_area; + } + + pack = alloc_new_pack(bpf_fill_ill_insns); + if (!pack) + goto out; + + pos = 0; + +found_free_area: + bitmap_set(pack->bitmap, pos, nbits); + ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); + +out: + mutex_unlock(&pack_mutex); + return ptr; +} + +static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +{ + struct bpf_prog_pack *pack = NULL, *tmp; + unsigned int nbits; + unsigned long pos; + void *pack_ptr; + + mutex_lock(&pack_mutex); + if (hdr->size > bpf_prog_pack_size) { + module_memfree(hdr); + goto out; + } + + pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); + + list_for_each_entry(tmp, &pack_list, list) { + if (tmp->ptr == pack_ptr) { + pack = tmp; + break; + } + } + + if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) + goto out; + + nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); + pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + + WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); + + bitmap_clear(pack->bitmap, pos, nbits); + if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + bpf_prog_chunk_count(), 0) == 0) { + list_del(&pack->list); + module_memfree(pack->ptr); + kfree(pack); + } +out: + mutex_unlock(&pack_mutex); +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, @@ -833,12 +1007,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -846,9 +1019,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -867,7 +1040,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -877,20 +1050,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -903,10 +1075,117 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); +} + +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated at + * as the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory. + */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +} + +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * + * bpf_jit_binary_pack_free requires proper ro_header->size. However, + * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size + * must be set with either bpf_jit_binary_pack_finalize (normal path) or + * bpf_arch_text_copy (when jit fails). + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; } /* This symbol is only overridden by archs that have different @@ -918,7 +1197,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } @@ -1146,7 +1428,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -1157,6 +1439,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* ld_imm64 with an address of bpf subprog is not + * a user controlled constant. Don't randomize it, + * since it will conflict with jit_subprogs() logic. + */ + insn++; + i++; + continue; + } + /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. @@ -1661,6 +1953,11 @@ out: CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read_kernel(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ + DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) @@ -1668,15 +1965,6 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST -#define LDX_PROBE(SIZEOP, SIZE) \ - LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \ - CONT; - LDX_PROBE(B, 1) - LDX_PROBE(H, 2) - LDX_PROBE(W, 4) - LDX_PROBE(DW, 8) -#undef LDX_PROBE #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ @@ -1829,28 +2117,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, } #endif -bool bpf_prog_array_compatible(struct bpf_array *array, - const struct bpf_prog *fp) +bool bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { bool ret; if (fp->kprobe_override) return false; - spin_lock(&array->aux->owner.lock); - - if (!array->aux->owner.type) { + spin_lock(&map->owner.lock); + if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->owner.type = fp->type; - array->aux->owner.jited = fp->jited; + map->owner.type = fp->type; + map->owner.jited = fp->jited; + map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = array->aux->owner.type == fp->type && - array->aux->owner.jited == fp->jited; + ret = map->owner.type == fp->type && + map->owner.jited == fp->jited && + map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } - spin_unlock(&array->aux->owner.lock); + spin_unlock(&map->owner.lock); + return ret; } @@ -1862,13 +2152,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; - struct bpf_array *array; - if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + if (!map_type_contains_progs(map)) continue; - array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) { + if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -1968,18 +2256,10 @@ static struct bpf_prog_dummy { }, }; -/* to avoid allocating empty bpf_prog_array for cgroups that - * don't have bpf program attached use one global 'empty_prog_array' - * It will not be modified the caller of bpf_prog_array_alloc() - * (since caller requested prog_cnt == 0) - * that pointer should be 'freed' by bpf_prog_array_free() - */ -static struct { - struct bpf_prog_array hdr; - struct bpf_prog *null_prog; -} empty_prog_array = { +struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; +EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { @@ -1989,12 +2269,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) (prog_cnt + 1), flags); - return &empty_prog_array.hdr; + return &bpf_empty_prog_array.hdr; } void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || progs == &empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } @@ -2350,6 +2630,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; @@ -2453,6 +2734,16 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + +int __weak bpf_arch_text_invalidate(void *dst, size_t len) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b3e6b9422238..f4860ac756cd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -27,6 +27,7 @@ #include <linux/kthread.h> #include <linux/capability.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #include <linux/netdevice.h> /* netif_receive_skb_list */ #include <linux/etherdevice.h> /* eth_type_trans */ @@ -397,7 +398,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, + struct bpf_map *map, int fd) { struct bpf_prog *prog; @@ -405,7 +407,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + if (prog->expected_attach_type != BPF_XDP_CPUMAP || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } @@ -457,7 +460,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ @@ -671,7 +674,7 @@ static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __cpu_map_lookup_elem); } -static int cpu_map_btf_id; +BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, @@ -681,8 +684,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_cpu_map", - .map_btf_id = &cpu_map_btf_id, + .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index fe019dbdb3f0..c2867068e5bd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,6 +48,7 @@ #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -858,7 +859,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) + if (prog->expected_attach_type != BPF_XDP_DEVMAP || + !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } @@ -1004,7 +1006,7 @@ static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __dev_map_hash_lookup_elem); } -static int dev_map_btf_id; +BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1014,12 +1016,10 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; -static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1029,8 +1029,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_hash_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d29af9988f37..17fb69c0e0dc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -10,6 +10,7 @@ #include <linux/random.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -139,7 +140,7 @@ static inline bool htab_use_raw_lock(const struct bpf_htab *htab) static void htab_init_buckets(struct bpf_htab *htab) { - unsigned i; + unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); @@ -238,7 +239,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (likely(!map_value_has_timer(&htab->map))) + if (!map_value_has_timer(&htab->map)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -254,6 +255,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!map_value_has_kptrs(&htab->map)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_elems(struct bpf_htab *htab) { int i; @@ -725,12 +745,15 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) +static void check_and_free_fields(struct bpf_htab *htab, + struct htab_elem *elem) { - if (unlikely(map_value_has_timer(&htab->map))) - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + void *map_value = elem->key + round_up(htab->map.key_size, 8); + + if (map_value_has_timer(&htab->map)) + bpf_timer_cancel_and_free(map_value + htab->map.timer_off); + if (map_value_has_kptrs(&htab->map)) + bpf_map_free_kptrs(&htab->map, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -738,7 +761,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) */ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { - struct bpf_htab *htab = (struct bpf_htab *)arg; + struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -757,7 +780,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); break; } @@ -829,7 +852,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); kfree(l); } @@ -857,7 +880,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -1104,7 +1127,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); else - check_and_free_timer(htab, l_old); + check_and_free_fields(htab, l_old); } ret = 0; err: @@ -1114,7 +1137,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_timer(htab, elem); + check_and_free_fields(htab, elem); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -1419,8 +1442,14 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct hlist_nulls_node *n; struct htab_elem *l; - hlist_nulls_for_each_entry(l, n, head, hash_node) - check_and_free_timer(htab, l); + hlist_nulls_for_each_entry(l, n, head, hash_node) { + /* We don't reset or free kptr on uref dropping to zero, + * hence just free timer. + */ + bpf_timer_cancel_and_free(l->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); + } cond_resched_rcu(); } rcu_read_unlock(); @@ -1430,7 +1459,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - if (likely(!map_value_has_timer(&htab->map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(&htab->map)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1453,11 +1483,14 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (!htab_is_prealloc(htab)) + if (!htab_is_prealloc(htab)) { delete_all_elements(htab); - else + } else { + htab_free_prealloced_kptrs(htab); prealloc_destroy(htab); + } + bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) @@ -1594,7 +1627,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); - u32 batch, max_count, size, bucket_size; + u32 batch, max_count, size, bucket_size, map_id; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1636,7 +1669,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to - * 1000, it was observed that a bucket can have upto 5 entries. + * 1000, it was observed that a bucket can have up to 5 entries. */ bucket_size = 5; @@ -1719,6 +1752,14 @@ again_nocopy: } } else { value = l->key + roundup_key_size; + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + struct bpf_map **inner_map = value; + + /* Actual value is the id of the inner map */ + map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); + value = &map_id; + } + if (elem_map_flags & BPF_F_LOCK) copy_map_value_locked(map, dst_val, value, true); @@ -2105,7 +2146,7 @@ out: return num_elems; } -static int htab_map_btf_id; +BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2122,12 +2163,10 @@ const struct bpf_map_ops htab_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2145,8 +2184,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2161,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + else + return NULL; +} + static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -2173,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; @@ -2252,7 +2320,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2263,16 +2330,15 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2283,12 +2349,12 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2412,7 +2478,6 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } -static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -2425,6 +2490,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_of_maps_map_btf_id, + BATCH_OPS(htab), + .map_btf_id = &htab_map_btf_ids[0], }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 55c084251fab..225806a02efb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -17,6 +17,7 @@ #include <linux/pid_namespace.h> #include <linux/proc_ns.h> #include <linux/security.h> +#include <linux/btf_ids.h> #include "../../lib/kstrtox.h" @@ -102,7 +103,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -115,7 +116,23 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, +}; + +BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); +} + +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { + .func = bpf_map_lookup_percpu_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { @@ -224,13 +241,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) if (unlikely(!task)) goto err_clear; - strncpy(buf, task->comm, size); - - /* Verifier guarantees that size > 0. For task->comm exceeding - * size, guarantee that buf is %NUL-terminated. Unconditionally - * done here to save the size test. - */ - buf[size - 1] = 0; + /* Verifier guarantees that size > 0 */ + strscpy(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); @@ -672,6 +684,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, + const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) +{ + int ret; + + /* flags is not used yet */ + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(!size)) + return 0; + + ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); + if (ret == size) + return 0; + + memset(dst, 0, size); + /* Return -EFAULT for partial read */ + return ret < 0 ? ret : -EFAULT; +} + +const struct bpf_func_proto bpf_copy_from_user_task_proto = { + .func = bpf_copy_from_user_task, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_BTF_ID, + .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg5_type = ARG_ANYTHING +}; + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) @@ -1059,7 +1104,7 @@ struct bpf_hrtimer { struct bpf_timer_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space resereved by struct bpf_timer + * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; @@ -1345,6 +1390,191 @@ out: kfree(t); } +BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +{ + unsigned long *kptr = map_value; + + return xchg(kptr, (unsigned long)ptr); +} + +/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() + * helper is determined dynamically by the verifier. + */ +#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) + +const struct bpf_func_proto bpf_kptr_xchg_proto = { + .func = bpf_kptr_xchg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = BPF_PTR_POISON, + .arg1_type = ARG_PTR_TO_KPTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, + .arg2_btf_id = BPF_PTR_POISON, +}; + +/* Since the upper 8 bits of dynptr->size is reserved, the + * maximum supported size is 2^24 - 1. + */ +#define DYNPTR_MAX_SIZE ((1UL << 24) - 1) +#define DYNPTR_TYPE_SHIFT 28 +#define DYNPTR_SIZE_MASK 0xFFFFFF +#define DYNPTR_RDONLY_BIT BIT(31) + +static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_RDONLY_BIT; +} + +static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) +{ + ptr->size |= type << DYNPTR_TYPE_SHIFT; +} + +static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_SIZE_MASK; +} + +int bpf_dynptr_check_size(u32 size) +{ + return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; +} + +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ + ptr->data = data; + ptr->offset = offset; + ptr->size = size; + bpf_dynptr_set_type(ptr, type); +} + +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ + memset(ptr, 0, sizeof(*ptr)); +} + +static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = bpf_dynptr_get_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} + +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +{ + int err; + + err = bpf_dynptr_check_size(size); + if (err) + goto error; + + /* flags is currently unsupported */ + if (flags) { + err = -EINVAL; + goto error; + } + + bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); + + return 0; + +error: + bpf_dynptr_set_null(ptr); + return err; +} + +const struct bpf_func_proto bpf_dynptr_from_mem_proto = { + .func = bpf_dynptr_from_mem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, +}; + +BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +{ + int err; + + if (!src->data) + return -EINVAL; + + err = bpf_dynptr_check_off_len(src, offset, len); + if (err) + return err; + + memcpy(dst, src->data + src->offset + offset, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_read_proto = { + .func = bpf_dynptr_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_DYNPTR, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +{ + int err; + + if (!dst->data || bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(dst, offset, len); + if (err) + return err; + + memcpy(dst->data + dst->offset + offset, src, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_write_proto = { + .func = bpf_dynptr_write, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +{ + int err; + + if (!ptr->data) + return 0; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return 0; + + if (bpf_dynptr_is_rdonly(ptr)) + return 0; + + return (unsigned long)(ptr->data + ptr->offset + offset); +} + +const struct bpf_func_proto bpf_dynptr_data_proto = { + .func = bpf_dynptr_data, + .gpl_only = false, + .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1369,6 +1599,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: @@ -1391,12 +1623,26 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } @@ -1423,6 +1669,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; + case BPF_FUNC_kptr_xchg: + return &bpf_kptr_xchg_proto; default: break; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8d9f7467bf..4f841e16779e 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -710,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock); static int populate_bpffs(struct dentry *parent) { struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; - struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; int err = 0, i; /* grab the mutex to make sure the kernel interactions with bpf_preload - * UMD are serialized + * are serialized */ mutex_lock(&bpf_preload_lock); @@ -722,40 +721,22 @@ static int populate_bpffs(struct dentry *parent) if (!bpf_preload_mod_get()) goto out; - if (!bpf_preload_ops->info.tgid) { - /* preload() will start UMD that will load BPF iterator programs */ - err = bpf_preload_ops->preload(objs); - if (err) + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + bpf_link_inc(objs[i].link); + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, objs[i].link); + if (err) { + bpf_link_put(objs[i].link); goto out_put; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - links[i] = bpf_link_by_id(objs[i].link_id); - if (IS_ERR(links[i])) { - err = PTR_ERR(links[i]); - goto out_put; - } } - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - err = bpf_iter_link_pin_kernel(parent, - objs[i].link_name, links[i]); - if (err) - goto out_put; - /* do not unlink successfully pinned links even - * if later link fails to pin - */ - links[i] = NULL; - } - /* finish() will tell UMD process to exit */ - err = bpf_preload_ops->finish(); - if (err) - goto out_put; } out_put: bpf_preload_mod_put(); out: mutex_unlock(&bpf_preload_lock); - for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) - if (!IS_ERR_OR_NULL(links[i])) - bpf_link_put(links[i]); return err; } diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c new file mode 100644 index 000000000000..fec8005a121c --- /dev/null +++ b/kernel/bpf/link_iter.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Red Hat, Inc. */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_link_info { + u32 link_id; +}; + +static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + struct bpf_link *link; + + link = bpf_link_get_curr_or_next(&info->link_id); + if (!link) + return NULL; + + if (*pos == 0) + ++*pos; + return link; +} + +static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + + ++*pos; + ++info->link_id; + bpf_link_put((struct bpf_link *)v); + return bpf_link_get_curr_or_next(&info->link_id); +} + +struct bpf_iter__bpf_link { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_link *, link); +}; + +DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link) + +static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_link ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.link = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_link_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_link_seq_show(seq, v, false); +} + +static void bpf_link_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_link_seq_show(seq, v, true); + else + bpf_link_put((struct bpf_link *)v); +} + +static const struct seq_operations bpf_link_seq_ops = { + .start = bpf_link_seq_start, + .next = bpf_link_seq_next, + .stop = bpf_link_seq_stop, + .show = bpf_link_seq_show, +}; + +BTF_ID_LIST(btf_bpf_link_id) +BTF_ID(struct, bpf_link) + +static const struct bpf_iter_seq_info bpf_link_seq_info = { + .seq_ops = &bpf_link_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_link_info), +}; + +static struct bpf_iter_reg bpf_link_reg_info = { + .target = "bpf_link", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_link, link), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_link_seq_info, +}; + +static int __init bpf_link_iter_init(void) +{ + bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id; + return bpf_iter_reg_target(&bpf_link_reg_info); +} + +late_initcall(bpf_link_iter_init); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 23f7f9d08a62..8654fc97f5fe 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf-cgroup.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> @@ -9,6 +9,7 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> #ifdef CONFIG_CGROUP_BPF @@ -446,7 +447,8 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int cgroup_storage_map_btf_id; +BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, + bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -456,8 +458,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, - .map_btf_name = "bpf_cgroup_storage_map", - .map_btf_id = &cgroup_storage_map_btf_id, + .map_btf_id = &cgroup_storage_map_btf_ids[0], }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 5763cc7ac4f1..f0d05a3cc4b9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -14,6 +14,7 @@ #include <linux/vmalloc.h> #include <net/ipv6.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -719,7 +720,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } -static int trie_map_btf_id; +BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie) const struct bpf_map_ops trie_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, @@ -732,6 +733,5 @@ const struct bpf_map_ops trie_map_ops = { .map_update_batch = generic_map_update_batch, .map_delete_batch = generic_map_delete_batch, .map_check_btf = trie_check_btf, - .map_btf_name = "lpm_trie", - .map_btf_id = &trie_map_btf_id, + .map_btf_id = &trie_map_btf_ids[0], }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5cd8f5277279..135205d0d560 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { + bpf_map_free_kptr_off_tab(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && - meta0->map_flags == meta1->map_flags; + meta0->map_flags == meta1->map_flags && + bpf_map_equal_kptr_off_tab(meta0, meta1); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index 26bced262473..c9d45c9d6918 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD if BPF_PRELOAD config BPF_PRELOAD_UMD - tristate "bpf_preload kernel module with user mode driver" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC + tristate "bpf_preload kernel module" default m help - This builds bpf_preload kernel module with embedded user mode driver. + This builds bpf_preload kernel module with embedded BPF programs for + introspection in bpffs. endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 1400ac58178e..20f89cc0a0a6 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -1,42 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ -LIBBPF_OUT = $(abspath $(obj))/libbpf -LIBBPF_A = $(LIBBPF_OUT)/libbpf.a -LIBBPF_DESTDIR = $(LIBBPF_OUT) -LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include - -# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test -# in tools/scripts/Makefile.include always succeeds when building the kernel -# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg". -$(LIBBPF_A): | $(LIBBPF_OUT) - $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \ - DESTDIR=$(LIBBPF_DESTDIR) prefix= \ - $(LIBBPF_OUT)/libbpf.a install_headers - -libbpf_hdrs: $(LIBBPF_A) - -.PHONY: libbpf_hdrs - -$(LIBBPF_OUT): - $(call msg,MKDIR,$@) - $(Q)mkdir -p $@ - -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ - -I $(LIBBPF_INCLUDE) -Wno-unused-result - -userprogs := bpf_preload_umd - -clean-files := libbpf/ - -$(obj)/iterators/iterators.o: | libbpf_hdrs - -bpf_preload_umd-objs := iterators/iterators.o -bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz - -$(obj)/bpf_preload_umd: $(LIBBPF_A) - -$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd +LIBBPF_INCLUDE = $(srctree)/tools/lib obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o -bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o +CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE) +bpf_preload-objs += bpf_preload_kern.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h index 2f9932276f2e..f065c91213a0 100644 --- a/kernel/bpf/preload/bpf_preload.h +++ b/kernel/bpf/preload/bpf_preload.h @@ -2,13 +2,13 @@ #ifndef _BPF_PRELOAD_H #define _BPF_PRELOAD_H -#include <linux/usermode_driver.h> -#include "iterators/bpf_preload_common.h" +struct bpf_preload_info { + char link_name[16]; + struct bpf_link *link; +}; struct bpf_preload_ops { - struct umd_info info; int (*preload)(struct bpf_preload_info *); - int (*finish)(void); struct module *owner; }; extern struct bpf_preload_ops *bpf_preload_ops; diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 53736e52c1df..5106b5372f0c 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -2,101 +2,87 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <linux/module.h> -#include <linux/pid.h> -#include <linux/fs.h> -#include <linux/sched/signal.h> #include "bpf_preload.h" +#include "iterators/iterators.lskel.h" -extern char bpf_preload_umd_start; -extern char bpf_preload_umd_end; +static struct bpf_link *maps_link, *progs_link; +static struct iterators_bpf *skel; -static int preload(struct bpf_preload_info *obj); -static int finish(void); +static void free_links_and_skel(void) +{ + if (!IS_ERR_OR_NULL(maps_link)) + bpf_link_put(maps_link); + if (!IS_ERR_OR_NULL(progs_link)) + bpf_link_put(progs_link); + iterators_bpf__destroy(skel); +} + +static int preload(struct bpf_preload_info *obj) +{ + strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); + obj[0].link = maps_link; + strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + obj[1].link = progs_link; + return 0; +} -static struct bpf_preload_ops umd_ops = { - .info.driver_name = "bpf_preload", +static struct bpf_preload_ops ops = { .preload = preload, - .finish = finish, .owner = THIS_MODULE, }; -static int preload(struct bpf_preload_info *obj) +static int load_skel(void) { - int magic = BPF_PRELOAD_START; - loff_t pos = 0; - int i, err; - ssize_t n; + int err; - err = fork_usermode_driver(&umd_ops.info); + skel = iterators_bpf__open(); + if (!skel) + return -ENOMEM; + err = iterators_bpf__load(skel); if (err) - return err; - - /* send the start magic to let UMD proceed with loading BPF progs */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - /* receive bpf_link IDs and names from UMD */ - pos = 0; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - n = kernel_read(umd_ops.info.pipe_from_umh, - &obj[i], sizeof(*obj), &pos); - if (n != sizeof(*obj)) - return -EPIPE; + goto out; + err = iterators_bpf__attach(skel); + if (err) + goto out; + maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd); + if (IS_ERR(maps_link)) { + err = PTR_ERR(maps_link); + goto out; } - return 0; -} - -static int finish(void) -{ - int magic = BPF_PRELOAD_END; - struct pid *tgid; - loff_t pos = 0; - ssize_t n; - - /* send the last magic to UMD. It will do a normal exit. */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - tgid = umd_ops.info.tgid; - if (tgid) { - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); + progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd); + if (IS_ERR(progs_link)) { + err = PTR_ERR(progs_link); + goto out; } + /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out + * makes skel_closenz() a no-op later in iterators_bpf__destroy(). + */ + close_fd(skel->links.dump_bpf_map_fd); + skel->links.dump_bpf_map_fd = 0; + close_fd(skel->links.dump_bpf_prog_fd); + skel->links.dump_bpf_prog_fd = 0; return 0; +out: + free_links_and_skel(); + return err; } -static int __init load_umd(void) +static int __init load(void) { int err; - err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, - &bpf_preload_umd_end - &bpf_preload_umd_start); + err = load_skel(); if (err) return err; - bpf_preload_ops = &umd_ops; + bpf_preload_ops = &ops; return err; } -static void __exit fini_umd(void) +static void __exit fini(void) { - struct pid *tgid; - bpf_preload_ops = NULL; - - /* kill UMD in case it's still there due to earlier error */ - tgid = umd_ops.info.tgid; - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); - } - umd_unload_blob(&umd_ops.info); + free_links_and_skel(); } -late_initcall(load_umd); -module_exit(fini_umd); +late_initcall(load); +module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S deleted file mode 100644 index f1f40223b5c3..000000000000 --- a/kernel/bpf/preload/bpf_preload_umd_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpf_preload_umd_start -bpf_preload_umd_start: - .incbin "kernel/bpf/preload/bpf_preload_umd" - .global bpf_preload_umd_end -bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index b8bd60511227..bfe24f8c5a20 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -35,15 +35,15 @@ endif .PHONY: all clean -all: iterators.skel.h +all: iterators.lskel.h clean: $(call msg,CLEAN) $(Q)rm -rf $(OUTPUT) iterators -iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) +iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) $(call msg,GEN-SKEL,$@) - $(Q)$(BPFTOOL) gen skeleton $< > $@ + $(Q)$(BPFTOOL) gen skeleton -L $< > $@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h deleted file mode 100644 index 8464d1a48c05..000000000000 --- a/kernel/bpf/preload/iterators/bpf_preload_common.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BPF_PRELOAD_COMMON_H -#define _BPF_PRELOAD_COMMON_H - -#define BPF_PRELOAD_START 0x5555 -#define BPF_PRELOAD_END 0xAAAA - -struct bpf_preload_info { - char link_name[16]; - int link_id; -}; - -#endif diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c deleted file mode 100644 index 5d872a705470..000000000000 --- a/kernel/bpf/preload/iterators/iterators.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2020 Facebook */ -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/resource.h> -#include <bpf/libbpf.h> -#include <bpf/bpf.h> -#include <sys/mount.h> -#include "iterators.skel.h" -#include "bpf_preload_common.h" - -int to_kernel = -1; -int from_kernel = 0; - -static int send_link_to_kernel(struct bpf_link *link, const char *link_name) -{ - struct bpf_preload_info obj = {}; - struct bpf_link_info info = {}; - __u32 info_len = sizeof(info); - int err; - - err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); - if (err) - return err; - obj.link_id = info.id; - if (strlen(link_name) >= sizeof(obj.link_name)) - return -E2BIG; - strcpy(obj.link_name, link_name); - if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) - return -EPIPE; - return 0; -} - -int main(int argc, char **argv) -{ - struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; - struct iterators_bpf *skel; - int err, magic; - int debug_fd; - - debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); - if (debug_fd < 0) - return 1; - to_kernel = dup(1); - close(1); - dup(debug_fd); - /* now stdin and stderr point to /dev/console */ - - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_START) { - printf("bad start magic %d\n", magic); - return 1; - } - setrlimit(RLIMIT_MEMLOCK, &rlim); - /* libbpf opens BPF object and loads it into the kernel */ - skel = iterators_bpf__open_and_load(); - if (!skel) { - /* iterators.skel.h is little endian. - * libbpf doesn't support automatic little->big conversion - * of BPF bytecode yet. - * The program load will fail in such case. - */ - printf("Failed load could be due to wrong endianness\n"); - return 1; - } - err = iterators_bpf__attach(skel); - if (err) - goto cleanup; - - /* send two bpf_link IDs with names to the kernel */ - err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); - if (err) - goto cleanup; - err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); - if (err) - goto cleanup; - - /* The kernel will proceed with pinnging the links in bpffs. - * UMD will wait on read from pipe. - */ - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_END) { - printf("bad final magic %d\n", magic); - err = -EINVAL; - } -cleanup: - iterators_bpf__destroy(skel); - - return err != 0; -} diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h new file mode 100644 index 000000000000..70f236a82fe1 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.lskel.h @@ -0,0 +1,425 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* THIS FILE IS AUTOGENERATED! */ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include <bpf/skel_internal.h> + +struct iterators_bpf { + struct bpf_loader_ctx ctx; + struct { + struct bpf_map_desc rodata; + } maps; + struct { + struct bpf_prog_desc dump_bpf_map; + struct bpf_prog_desc dump_bpf_prog; + } progs; + struct { + int dump_bpf_map_fd; + int dump_bpf_prog_fd; + } links; + struct iterators_bpf__rodata { + } *rodata; +}; + +static inline int +iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_map.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_map_fd = fd; + return fd; +} + +static inline int +iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_prog.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_prog_fd = fd; + return fd; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *skel) +{ + int ret = 0; + + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel); + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel); + return ret < 0 ? ret : 0; +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *skel) +{ + skel_closenz(skel->links.dump_bpf_map_fd); + skel_closenz(skel->links.dump_bpf_prog_fd); +} +static void +iterators_bpf__destroy(struct iterators_bpf *skel) +{ + if (!skel) + return; + iterators_bpf__detach(skel); + skel_closenz(skel->progs.dump_bpf_map.prog_fd); + skel_closenz(skel->progs.dump_bpf_prog.prog_fd); + skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096); + skel_closenz(skel->maps.rodata.map_fd); + skel_free(skel); +} +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + struct iterators_bpf *skel; + + skel = skel_alloc(sizeof(*skel)); + if (!skel) + goto cleanup; + skel->ctx.sz = (void *)&skel->links - (void *)skel; + skel->rodata = skel_prep_map_data((void *)"\ +\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ +\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98); + if (!skel->rodata) + goto cleanup; + skel->maps.rodata.initial_value = (__u64) (long) skel->rodata; + return skel; +cleanup: + iterators_bpf__destroy(skel); + return NULL; +} + +static inline int +iterators_bpf__load(struct iterators_bpf *skel) +{ + struct bpf_load_and_run_opts opts = {}; + int err; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = 6056; + opts.data = (void *)"\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\ +\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\ +\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\ +\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\ +\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\ +\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\ +\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\ +\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\ +\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\ +\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\ +\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\ +\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ +\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\ +\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\ +\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\ +\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\ +\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\ +\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\ +\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\ +\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\ +\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\ +\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\ +\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\ +\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\ +\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\ +\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\ +\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ +\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\ +\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\ +\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\ +\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\ +\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\ +\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\ +\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\ +\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\ +\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\ +\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ +\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\ +\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\ +\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\ +\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\ +\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\ +\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\ +\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\ +\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ +\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ +\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\ +\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ +\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\ +\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\ +\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\ +\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\ +\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\ +\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\ +\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\ +\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\ +\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\ +\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\ +\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\ +\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\ +\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\ +\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\ +\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\ +\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\ +\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\ +\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\ +\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\ +\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\ +\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\ +\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\ +\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\ +\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\ +\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\ +\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\ +\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\ +\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\ +\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\ +\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\ +\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\ +\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\ +\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\ +\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\ +\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\ +\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\ +\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\ +\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\ +\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\ +\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\ +\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\ +\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\ +\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\ +\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\ +\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\ +\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\ +\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\ +\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\ +\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\ +\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\ +\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\ +\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\ +\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\ +\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\ +\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\ +\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\ +\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\ +\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\ +\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\ +\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\ +\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\ +\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\ +\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\ +\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\ +\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ +\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ +\0\0\0\0"; + opts.insns_sz = 2216; + opts.insns = (void *)"\ +\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ +\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ +\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\ +\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\ +\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\ +\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\ +\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ +\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ +\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ +\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ +\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ +\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ +\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ +\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ +\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\ +\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ +\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ +\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ +\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ +\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\ +\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\ +\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\ +\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\ +\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\ +\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\ +\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\ +\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\ +\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\ +\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\ +\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\ +\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\ +\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\ +\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\ +\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\ +\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\ +\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\ +\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\ +\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\ +\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\ +\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\ +\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\ +\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\ +\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\ +\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0"; + err = bpf_load_and_run(&opts); + if (err < 0) + return err; + skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value, + 4096, PROT_READ, skel->maps.rodata.map_fd); + if (!skel->rodata) + return -ENOMEM; + return 0; +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct iterators_bpf *skel; + + skel = iterators_bpf__open(); + if (!skel) + return NULL; + if (iterators_bpf__load(skel)) { + iterators_bpf__destroy(skel); + return NULL; + } + return skel; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h deleted file mode 100644 index cf9a6a94b3a4..000000000000 --- a/kernel/bpf/preload/iterators/iterators.skel.h +++ /dev/null @@ -1,412 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ - -/* THIS FILE IS AUTOGENERATED! */ -#ifndef __ITERATORS_BPF_SKEL_H__ -#define __ITERATORS_BPF_SKEL_H__ - -#include <stdlib.h> -#include <bpf/libbpf.h> - -struct iterators_bpf { - struct bpf_object_skeleton *skeleton; - struct bpf_object *obj; - struct { - struct bpf_map *rodata; - } maps; - struct { - struct bpf_program *dump_bpf_map; - struct bpf_program *dump_bpf_prog; - } progs; - struct { - struct bpf_link *dump_bpf_map; - struct bpf_link *dump_bpf_prog; - } links; - struct iterators_bpf__rodata { - char dump_bpf_map____fmt[35]; - char dump_bpf_map____fmt_1[14]; - char dump_bpf_prog____fmt[32]; - char dump_bpf_prog____fmt_2[17]; - } *rodata; -}; - -static void -iterators_bpf__destroy(struct iterators_bpf *obj) -{ - if (!obj) - return; - if (obj->skeleton) - bpf_object__destroy_skeleton(obj->skeleton); - free(obj); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj); - -static inline struct iterators_bpf * -iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) -{ - struct iterators_bpf *obj; - - obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); - if (!obj) - return NULL; - if (iterators_bpf__create_skeleton(obj)) - goto err; - if (bpf_object__open_skeleton(obj->skeleton, opts)) - goto err; - - return obj; -err: - iterators_bpf__destroy(obj); - return NULL; -} - -static inline struct iterators_bpf * -iterators_bpf__open(void) -{ - return iterators_bpf__open_opts(NULL); -} - -static inline int -iterators_bpf__load(struct iterators_bpf *obj) -{ - return bpf_object__load_skeleton(obj->skeleton); -} - -static inline struct iterators_bpf * -iterators_bpf__open_and_load(void) -{ - struct iterators_bpf *obj; - - obj = iterators_bpf__open(); - if (!obj) - return NULL; - if (iterators_bpf__load(obj)) { - iterators_bpf__destroy(obj); - return NULL; - } - return obj; -} - -static inline int -iterators_bpf__attach(struct iterators_bpf *obj) -{ - return bpf_object__attach_skeleton(obj->skeleton); -} - -static inline void -iterators_bpf__detach(struct iterators_bpf *obj) -{ - return bpf_object__detach_skeleton(obj->skeleton); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj) -{ - struct bpf_object_skeleton *s; - - s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); - if (!s) - return -1; - obj->skeleton = s; - - s->sz = sizeof(*s); - s->name = "iterators_bpf"; - s->obj = &obj->obj; - - /* maps */ - s->map_cnt = 1; - s->map_skel_sz = sizeof(*s->maps); - s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); - if (!s->maps) - goto err; - - s->maps[0].name = "iterator.rodata"; - s->maps[0].map = &obj->maps.rodata; - s->maps[0].mmaped = (void **)&obj->rodata; - - /* programs */ - s->prog_cnt = 2; - s->prog_skel_sz = sizeof(*s->progs); - s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); - if (!s->progs) - goto err; - - s->progs[0].name = "dump_bpf_map"; - s->progs[0].prog = &obj->progs.dump_bpf_map; - s->progs[0].link = &obj->links.dump_bpf_map; - - s->progs[1].name = "dump_bpf_prog"; - s->progs[1].prog = &obj->progs.dump_bpf_prog; - s->progs[1].link = &obj->links.dump_bpf_prog; - - s->data_sz = 7176; - s->data = (void *)"\ -\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ -\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ -\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ -\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ -\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ -\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ -\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ -\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ -\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ -\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ -\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ -\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ -\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ -\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ -\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ -\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ -\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ -\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ -\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ -\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ -\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ -\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ -\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ -\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ -\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ -\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ -\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ -\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ -\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ -\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ -\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ -\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ -\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ -\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ -\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ -\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ -\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ -\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ -\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ -\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ -\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ -\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ -\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ -\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ -\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ -\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ -\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ -\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ -\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ -\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ -\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ -\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ -\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ -\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ -\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ -\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ -\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ -\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ -\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ -\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ -\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ -\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ -\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ -\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ -\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ -\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ -\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ -\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ -\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ -\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ -\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ -\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ -\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ -\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ -\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ -\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ -\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ -\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ -\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ -\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ -\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ -\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ -\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ -\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ -\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ -\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ -\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ -\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ -\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ -\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ -\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ -\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ -\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ -\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ -\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ -\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ -\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ -\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ -\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ -\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ -\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ -\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ -\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ -\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ -\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ -\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ -\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ -\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ -\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ -\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ -\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ -\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ -\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ -\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ -\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ -\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ -\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ -\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ -\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ -\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ -\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ -\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ -\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ -\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ -\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ -\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ -\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ -\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ -\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ -\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ -\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ -\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ -\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ -\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ -\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ -\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ -\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ -\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ -\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ -\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ -\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ -\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ -\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ -\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ -\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ -\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ -\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ -\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ -\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ -\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ -\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ -\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ -\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ -\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ -\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ -\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ -\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ -\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ -\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ -\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ -\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ -\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ -\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ -\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ -\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ -\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ -\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ -\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ -\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ -\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ -\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ -\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ -\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ -\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ -\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ -\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ -\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ -\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ -\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; - - return 0; -err: - bpf_object__destroy_skeleton(s); - return -1; -} - -#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f9c734aaa990..a1c0794ae49d 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -8,6 +8,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/capability.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -247,7 +248,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } -static int queue_map_btf_id; +BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -260,11 +261,9 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &queue_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; -static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -277,6 +276,5 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &stack_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 556a769b5b80..e2618fb5870e 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -6,6 +6,7 @@ #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> +#include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; @@ -143,7 +144,7 @@ static void reuseport_array_free(struct bpf_map *map) /* * Once reaching here, all sk->sk_user_data is not - * referenceing this "array". "array" can be freed now. + * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } @@ -337,7 +338,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } -static int reuseport_array_map_btf_id; +BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, @@ -346,6 +347,5 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, - .map_btf_name = "reuseport_array", - .map_btf_id = &reuseport_array_map_btf_id, + .map_btf_id = &reuseport_array_map_btf_ids[0], }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 710ba9de12ce..ded4faeca192 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -10,6 +10,7 @@ #include <linux/poll.h> #include <linux/kmemleak.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -263,7 +264,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } -static int ringbuf_map_btf_id; +BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, @@ -274,8 +275,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, - .map_btf_name = "bpf_ringbuf_map", - .map_btf_id = &ringbuf_map_btf_id, + .map_btf_id = &ringbuf_map_btf_ids[0], }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, @@ -404,7 +404,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -417,7 +417,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -475,3 +475,81 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, + struct bpf_dynptr_kern *, ptr) +{ + struct bpf_ringbuf_map *rb_map; + void *sample; + int err; + + if (unlikely(flags)) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + err = bpf_dynptr_check_size(size); + if (err) { + bpf_dynptr_set_null(ptr); + return err; + } + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + sample = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!sample) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { + .func = bpf_ringbuf_reserve_dynptr, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, +}; + +BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, false /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { + .func = bpf_ringbuf_submit_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, true /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { + .func = bpf_ringbuf_discard_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 22c8ae94e4c1..1adbe67cdb95 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -100,13 +100,11 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); @@ -132,7 +130,8 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev_vma = NULL; + const char *prev_build_id; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with @@ -150,6 +149,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } for (i = 0; i < trace_nr; i++) { + if (range_in_vma(prev_vma, ips[i], ips[i])) { + vma = prev_vma; + memcpy(id_offs[i].build_id, prev_build_id, + BUILD_ID_SIZE_MAX); + goto build_id_valid; + } vma = find_vma(current->mm, ips[i]); if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ @@ -158,15 +163,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } +build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + prev_vma = vma; + prev_build_id = id_offs[i].build_id; } bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * -get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; @@ -177,9 +185,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) if (!entry) return NULL; - entry->nr = init_nr + - stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), - sysctl_perf_event_max_stack - init_nr, 0); + entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, + max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is @@ -191,7 +198,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) int i; /* copy data from the end to avoid using extra buffer */ - for (i = entry->nr - 1; i >= (int)init_nr; i--) + for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } @@ -208,27 +215,19 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; - /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth - */ - trace_nr = trace->nr - init_nr; - - if (trace_nr <= skip) + if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); - ips = trace->ip + skip + init_nr; + ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); @@ -285,8 +284,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; @@ -295,8 +293,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + max_depth += skip; + if (max_depth > sysctl_perf_event_max_stack) + max_depth = sysctl_perf_event_max_stack; + + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -387,7 +389,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { - u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; @@ -412,30 +414,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto err_fault; num_elem = size / elem_size; - if (sysctl_perf_event_max_stack < num_elem) - init_nr = 0; - else - init_nr = sysctl_perf_event_max_stack - num_elem; + max_depth = num_elem + skip; + if (sysctl_perf_event_max_stack < max_depth) + max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) - trace = get_callchain_entry_for_task(task, init_nr); + trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; - trace_nr = trace->nr - init_nr; - if (trace_nr < skip) + if (trace->nr < skip) goto err_fault; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; - ips = trace->ip + skip + init_nr; + + ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else @@ -654,7 +654,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static int stack_trace_map_btf_id; +BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, @@ -664,6 +664,5 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_stack_map", - .map_btf_id = &stack_trace_map_btf_id, + .map_btf_id = &stack_trace_map_btf_ids[0], }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ca70fe6fba38..2b69306d3c6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6,6 +6,7 @@ #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> +#include <linux/bsearch.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> @@ -29,9 +30,11 @@ #include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> +#include <linux/sort.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> +#include <linux/trace_events.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -472,14 +475,128 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +static int bpf_map_kptr_off_cmp(const void *a, const void *b) +{ + const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + + if (off_desc1->offset < off_desc2->offset) + return -1; + else if (off_desc1->offset > off_desc2->offset) + return 1; + return 0; +} + +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +{ + /* Since members are iterated in btf_find_field in increasing order, + * offsets appended to kptr_off_tab are in increasing order, so we can + * do bsearch to find exact match. + */ + struct bpf_map_value_off *tab; + + if (!map_value_has_kptrs(map)) + return NULL; + tab = map->kptr_off_tab; + return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); +} + +void bpf_map_free_kptr_off_tab(struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + if (!map_value_has_kptrs(map)) + return; + for (i = 0; i < tab->nr_off; i++) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(tab); + map->kptr_off_tab = NULL; +} + +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; + int size, i; + + if (!map_value_has_kptrs(map)) + return ERR_PTR(-ENOENT); + size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); + new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) + return ERR_PTR(-ENOMEM); + /* Do a deep copy of the kptr_off_tab */ + for (i = 0; i < tab->nr_off; i++) { + btf_get(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { + while (i--) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(new_tab); + return ERR_PTR(-ENXIO); + } + } + return new_tab; +} + +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +{ + struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; + bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + int size; + + if (!a_has_kptr && !b_has_kptr) + return true; + if (a_has_kptr != b_has_kptr) + return false; + if (tab_a->nr_off != tab_b->nr_off) + return false; + size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); + return !memcmp(tab_a, tab_b, size); +} + +/* Caller must ensure map_value_has_kptrs is true. Note that this function can + * be called on a map value while the map_value is visible to BPF programs, as + * it ensures the correct synchronization, and we already enforce the same using + * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. + */ +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + unsigned long *btf_id_ptr; + int i; + + for (i = 0; i < tab->nr_off; i++) { + struct bpf_map_value_off_desc *off_desc = &tab->off[i]; + unsigned long old_ptr; + + btf_id_ptr = map_value + off_desc->offset; + if (off_desc->type == BPF_KPTR_UNREF) { + u64 *p = (u64 *)btf_id_ptr; + + WRITE_ONCE(p, 0); + continue; + } + old_ptr = xchg(btf_id_ptr, 0); + off_desc->kptr.dtor((void *)old_ptr); + } +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); + kfree(map->off_arr); bpf_map_release_memcg(map); - /* implementation dependent freeing */ + /* implementation dependent freeing, map_free callback also does + * bpf_map_free_kptr_off_tab, if needed. + */ map->ops->map_free(map); } @@ -556,16 +673,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { - const struct bpf_map *map = filp->private_data; - const struct bpf_array *array; + struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { - array = container_of(map, struct bpf_array, map); - spin_lock(&array->aux->owner.lock); - type = array->aux->owner.type; - jited = array->aux->owner.jited; - spin_unlock(&array->aux->owner.lock); + if (map_type_contains_progs(map)) { + spin_lock(&map->owner.lock); + type = map->owner.type; + jited = map->owner.jited; + spin_unlock(&map->owner.lock); } seq_printf(m, @@ -641,7 +756,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) int err; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map)) + map_value_has_timer(map) || map_value_has_kptrs(map)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -768,6 +883,84 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } +static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) +{ + struct bpf_map *map = (struct bpf_map *)priv; + u32 *off_base = map->off_arr->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = map->off_arr->field_sz + (a - off_base); + sz_b = map->off_arr->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +static int bpf_map_alloc_off_arr(struct bpf_map *map) +{ + bool has_spin_lock = map_value_has_spin_lock(map); + bool has_timer = map_value_has_timer(map); + bool has_kptrs = map_value_has_kptrs(map); + struct bpf_map_off_arr *off_arr; + u32 i; + + if (!has_spin_lock && !has_timer && !has_kptrs) { + map->off_arr = NULL; + return 0; + } + + off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); + if (!off_arr) + return -ENOMEM; + map->off_arr = off_arr; + + off_arr->cnt = 0; + if (has_spin_lock) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->spin_lock_off; + off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); + off_arr->cnt++; + } + if (has_timer) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->timer_off; + off_arr->field_sz[i] = sizeof(struct bpf_timer); + off_arr->cnt++; + } + if (has_kptrs) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + u32 *off = &off_arr->field_off[off_arr->cnt]; + u8 *sz = &off_arr->field_sz[off_arr->cnt]; + + for (i = 0; i < tab->nr_off; i++) { + *off++ = tab->off[i].offset; + *sz++ = sizeof(u64); + } + off_arr->cnt += tab->nr_off; + } + + if (off_arr->cnt == 1) + return 0; + sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), + map_off_arr_cmp, map_off_arr_swap, map); + return 0; +} + static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -821,10 +1014,34 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } - if (map->ops->map_check_btf) + map->kptr_off_tab = btf_parse_kptrs(btf, value_type); + if (map_value_has_kptrs(map)) { + if (!bpf_capable()) { + ret = -EPERM; + goto free_map_tab; + } + if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { + ret = -EACCES; + goto free_map_tab; + } + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + } + + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); + if (ret < 0) + goto free_map_tab; + } return ret; +free_map_tab: + bpf_map_free_kptr_off_tab(map); + return ret; } #define BPF_MAP_CREATE_LAST_FIELD map_extra @@ -874,6 +1091,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); + spin_lock_init(&map->owner.lock); map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; @@ -912,10 +1130,14 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = bpf_map_alloc_off_arr(map); if (err) goto free_map; + err = security_bpf_map_alloc(map); + if (err) + goto free_map_off_arr; + err = bpf_map_alloc_id(map); if (err) goto free_map_sec; @@ -938,6 +1160,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); +free_map_off_arr: + kfree(map->off_arr); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -986,6 +1210,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } +EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -1352,7 +1577,6 @@ int generic_map_delete_batch(struct bpf_map *map, err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); if (err) break; cond_resched(); @@ -1361,6 +1585,8 @@ int generic_map_delete_batch(struct bpf_map *map, err = -EFAULT; kvfree(key); + + maybe_wait_bpf_programs(map); return err; } @@ -1637,7 +1863,7 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map)) { + map_value_has_timer(map) || map_value_has_kptrs(map)) { fdput(f); return -ENOTSUPP; } @@ -2220,7 +2446,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | - BPF_F_TEST_RND_HI32)) + BPF_F_TEST_RND_HI32 | + BPF_F_XDP_HAS_FRAGS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2306,6 +2533,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; + prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = security_bpf_prog_alloc(prog->aux); if (err) @@ -2491,6 +2719,7 @@ void bpf_link_put(struct bpf_link *link) bpf_link_free(link); } } +EXPORT_SYMBOL(bpf_link_put); static int bpf_link_release(struct inode *inode, struct file *filp) { @@ -2562,7 +2791,7 @@ static int bpf_link_alloc_id(struct bpf_link *link) * pre-allocated resources are to be freed with bpf_cleanup() call. All the * transient state is passed around in struct bpf_link_primer. * This is preferred way to create and initialize bpf_link, especially when - * there are complicated and expensive operations inbetween creating bpf_link + * there are complicated and expensive operations in between creating bpf_link * itself and attaching it to BPF hook. By using bpf_link_prime() and * bpf_link_settle() kernel code using bpf_link doesn't have to perform * expensive (and potentially failing) roll back operations in a rare case @@ -2633,20 +2862,14 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) return link; } - -struct bpf_tracing_link { - struct bpf_link link; - enum bpf_attach_type attach_type; - struct bpf_trampoline *trampoline; - struct bpf_prog *tgt_prog; -}; +EXPORT_SYMBOL(bpf_link_get_from_fd); static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline)); bpf_trampoline_put(tr_link->trampoline); @@ -2659,7 +2882,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -2668,7 +2891,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); seq_printf(seq, "attach_type:\t%d\n", @@ -2679,7 +2902,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -2698,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, - u32 btf_id) + u32 btf_id, + u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -2760,9 +2984,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -2830,11 +3055,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tgt_prog = prog->aux->dst_prog; } - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; - err = bpf_trampoline_link_prog(prog, tr); + err = bpf_trampoline_link_prog(&link->link, tr); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -3017,68 +3242,52 @@ out_put_file: fput(perf_file); return err; } +#else +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PERF_EVENTS */ -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd - -static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +static int bpf_raw_tp_link_attach(struct bpf_prog *prog, + const char __user *user_tp_name) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; const char *tp_name; char buf[128]; int err; - if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) - return -EINVAL; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: - if (attr->raw_tracepoint.name) { + if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. */ - err = -EINVAL; - goto out_put_prog; - } + return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } - err = bpf_tracing_prog_attach(prog, 0, 0); - if (err >= 0) - return err; - goto out_put_prog; + return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: - if (strncpy_from_user(buf, - u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(buf) - 1) < 0) { - err = -EFAULT; - goto out_put_prog; - } + if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) + return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: - err = -EINVAL; - goto out_put_prog; + return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); - if (!btp) { - err = -ENOENT; - goto out_put_prog; - } + if (!btp) + return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER); if (!link) { @@ -3105,11 +3314,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) out_put_btp: bpf_put_raw_tracepoint(btp); -out_put_prog: - bpf_prog_put(prog); return err; } +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + int fd; + + if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) + return -EINVAL; + + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + if (fd < 0) + bpf_prog_put(prog); + return fd; +} + static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -3178,7 +3405,13 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; + case BPF_LSM_MAC: + return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: @@ -3321,12 +3554,17 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_MSG_VERDICT: + case BPF_SK_SKB_VERDICT: + return sock_map_bpf_prog_query(attr, uattr); default: return -EINVAL; } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu +#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -4230,22 +4468,7 @@ err_put: return err; } -static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, - struct bpf_prog *prog) -{ - if (attr->link_create.attach_type != prog->expected_attach_type) - return -EINVAL; - - if (prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, uattr, prog); - else if (prog->type == BPF_PROG_TYPE_EXT) - return bpf_tracing_prog_attach(prog, - attr->link_create.target_fd, - attr->link_create.target_btf_id); - return -EINVAL; -} - -#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; @@ -4266,16 +4489,20 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: - ret = tracing_bpf_link_attach(attr, uattr, prog); - goto out; + break; case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; goto out; } - ptype = prog->type; + break; + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type != BPF_PERF_EVENT && + attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { + ret = -EINVAL; + goto out; + } break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); @@ -4286,7 +4513,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) break; } - switch (ptype) { + switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -4296,8 +4523,27 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_EXT: + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); + break; + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: - ret = tracing_bpf_link_attach(attr, uattr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + ret = bpf_raw_tp_link_attach(prog, NULL); + else if (prog->expected_attach_type == BPF_TRACE_ITER) + ret = bpf_iter_link_attach(attr, uattr, prog); + else + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -4308,13 +4554,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; #endif -#ifdef CONFIG_PERF_EVENTS case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_KPROBE: ret = bpf_perf_link_attach(attr, prog); break; -#endif + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type == BPF_PERF_EVENT) + ret = bpf_perf_link_attach(attr, prog); + else + ret = bpf_kprobe_multi_link_attach(attr, prog); + break; default: ret = -EINVAL; } @@ -4428,6 +4677,25 @@ struct bpf_link *bpf_link_by_id(u32 id) return link; } +struct bpf_link *bpf_link_get_curr_or_next(u32 *id) +{ + struct bpf_link *link; + + spin_lock_bh(&link_idr_lock); +again: + link = idr_get_next(&link_idr, id); + if (link) { + link = bpf_link_inc_not_zero(link); + if (IS_ERR(link)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&link_idr_lock); + + return link; +} + #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) @@ -4595,9 +4863,21 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; + bool capable; int err; - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; + + /* Intent here is for unprivileged_bpf_disabled to block key object + * creation commands for unprivileged users; other actions depend + * of fd availability and access to bpffs, so are dependent on + * object creation success. Capabilities are later verified for + * operations such as load and map create, so even with unprivileged + * BPF disabled, capability checks are still carried out for these + * and other operations. + */ + if (!capable && + (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); @@ -4753,23 +5033,55 @@ static bool syscall_prog_is_valid_access(int off, int size, return true; } -BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) +BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { + struct bpf_prog * __maybe_unused prog; + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_PROG_LOAD: case BPF_BTF_LOAD: + case BPF_LINK_CREATE: + case BPF_RAW_TRACEPOINT_OPEN: break; - /* case BPF_PROG_TEST_RUN: - * is not part of this list to prevent recursive test_run - */ +#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ + case BPF_PROG_TEST_RUN: + if (attr->test.data_in || attr->test.data_out || + attr->test.ctx_out || attr->test.duration || + attr->test.repeat || attr->test.flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || + attr->test.ctx_size_in > U16_MAX) { + bpf_prog_put(prog); + return -EINVAL; + } + + run_ctx.bpf_cookie = 0; + run_ctx.saved_run_ctx = NULL; + if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { + /* recursion detected */ + bpf_prog_put(prog); + return -EBUSY; + } + attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); + __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); + bpf_prog_put(prog); + return 0; +#endif default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } +EXPORT_SYMBOL(bpf_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, @@ -4853,3 +5165,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = { const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; + +#ifdef CONFIG_SYSCTL +static int bpf_stats_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} + +void __weak unpriv_ebpf_notify(int new_state) +{ +} + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + + unpriv_ebpf_notify(unpriv_enable); + + return ret; +} + +static struct ctl_table bpf_syscall_table[] = { + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = bpf_stats_handler, + }, + { } +}; + +static int __init bpf_syscall_sysctl_init(void) +{ + register_sysctl_init("kernel", bpf_syscall_table); + return 0; +} +late_initcall(bpf_syscall_sysctl_init); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index d94696198ef8..8c921799def4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -99,7 +99,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task, if (!prog) return 0; - meta.seq = seq; ctx.meta = &meta; ctx.task = task; return bpf_iter_run_prog(prog, &ctx); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5e7edf913060..93c7675f0c9e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex); bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type ptype = prog->type; - return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN; + return (ptype == BPF_PROG_TYPE_TRACING && + (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN)) || + (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void *bpf_jit_alloc_exec_page(void) @@ -45,7 +48,7 @@ void *bpf_jit_alloc_exec_page(void) set_vm_flush_reset_perms(image); /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. + * every time new program is attached or detached. */ set_memory_x((long)image, 1); return image; @@ -117,18 +120,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr) tr->mod = NULL; } -static int is_ftrace_location(void *ip) -{ - long addr; - - addr = ftrace_location((long)ip); - if (!addr) - return 0; - if (WARN_ON_ONCE(addr != (long)ip)) - return -EFAULT; - return 1; -} - static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; @@ -160,12 +151,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad static int register_fentry(struct bpf_trampoline *tr, void *new_addr) { void *ip = tr->func.addr; + unsigned long faddr; int ret; - ret = is_ftrace_location(ip); - if (ret < 0) - return ret; - tr->func.ftrace_managed = ret; + faddr = ftrace_location((unsigned long)ip); + if (faddr) + tr->func.ftrace_managed = true; if (bpf_trampoline_module_get(tr)) return -ENOENT; @@ -180,30 +171,30 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -static struct bpf_tramp_progs * +static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - const struct bpf_prog_aux *aux; - struct bpf_tramp_progs *tprogs; - struct bpf_prog **progs; + struct bpf_tramp_link *link; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link **links; int kind; *total = 0; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tprogs[kind].nr_progs = tr->progs_cnt[kind]; + tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - progs = tprogs[kind].progs; + links = tlinks[kind].links; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= aux->prog->call_get_func_ip; - *progs++ = aux->prog; + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= link->link.prog->call_get_func_ip; + *links++ = link; } } - return tprogs; + return tlinks; } static void __bpf_tramp_image_put_deferred(struct work_struct *work) @@ -213,7 +204,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -310,7 +301,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -332,7 +323,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: @@ -342,14 +333,14 @@ out: static int bpf_trampoline_update(struct bpf_trampoline *tr) { struct bpf_tramp_image *im; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; u32 flags = BPF_TRAMP_F_RESTORE_REGS; bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tprogs)) - return PTR_ERR(tprogs); + tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tlinks)) + return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); @@ -365,15 +356,15 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs || - tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) + if (tlinks[BPF_TRAMP_FEXIT].nr_links || + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; if (ip_arg) flags |= BPF_TRAMP_F_IP_ARG; err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tprogs, + &tr->func.model, flags, tlinks, tr->func.addr); if (err < 0) goto out; @@ -393,7 +384,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) tr->cur_image = im; tr->selector++; out: - kfree(tprogs); + kfree(tlinks); return err; } @@ -419,13 +410,14 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; + struct bpf_tramp_link *link_exiting; int err = 0; - int cnt; + int cnt = 0, i; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -434,32 +426,43 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) err = -EBUSY; goto out; } - cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) { err = -EBUSY; goto out; } - tr->extension_prog = prog; + tr->extension_prog = link->link.prog; err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - prog->bpf_func); + link->link.prog->bpf_func); goto out; } - if (cnt >= BPF_MAX_TRAMP_PROGS) { + if (cnt >= BPF_MAX_TRAMP_LINKS) { err = -E2BIG; goto out; } - if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + if (!hlist_unhashed(&link->tramp_hlist)) { /* prog already linked */ err = -EBUSY; goto out; } - hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + if (link_exiting->link.prog != link->link.prog) + continue; + /* prog already linked */ + err = -EBUSY; + goto out; + } + + hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr); if (err) { - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } out: @@ -468,12 +471,12 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); @@ -482,7 +485,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) tr->extension_prog = NULL; goto out; } - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(tr); out: @@ -512,16 +515,19 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { + int i; + if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) - goto out; - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) - goto out; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) + goto out; + /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip @@ -571,11 +577,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; @@ -605,29 +614,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); __this_cpu_dec(*(prog->active)); migrate_enable(); @@ -647,7 +665,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { return -ENOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a39eedecc93a..aedac2ac02b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -187,6 +187,9 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); + static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; @@ -245,6 +248,7 @@ struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; bool pkt_access; + u8 release_regno; int regno; int access_size; int mem_size; @@ -257,6 +261,8 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; + struct bpf_map_value_off_desc *kptr_off_desc; + u8 uninit_dynptr_regno; }; struct btf *btf_vmlinux; @@ -452,7 +458,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { return base_type(type) == PTR_TO_SOCKET || base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM; + base_type(type) == PTR_TO_MEM || + base_type(type) == PTR_TO_BTF_ID; } static bool type_is_rdonly_mem(u32 type) @@ -470,17 +477,6 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -/* Determine whether the function releases some resources allocated by another - * function call. The first reference type argument will be assumed to be - * released by release_reference(). - */ -static bool is_release_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_release || - func_id == BPF_FUNC_ringbuf_submit || - func_id == BPF_FUNC_ringbuf_discard; -} - static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || @@ -498,7 +494,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_ringbuf_reserve) + func_id == BPF_FUNC_ringbuf_reserve || + func_id == BPF_FUNC_kptr_xchg) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -516,6 +513,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } @@ -535,10 +533,10 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[16] = {0}; + char postfix[16] = {0}, prefix[32] = {0}; static const char * const str[] = { [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", + [SCALAR_VALUE] = "scalar", [PTR_TO_CTX] = "ctx", [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", @@ -553,7 +551,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", @@ -561,17 +558,22 @@ static const char *reg_type_str(struct bpf_verifier_env *env, }; if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID || - base_type(type) == PTR_TO_PERCPU_BTF_ID) + if (base_type(type) == PTR_TO_BTF_ID) strncpy(postfix, "or_null_", 16); else strncpy(postfix, "_or_null", 16); } if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 16); + strncpy(prefix, "rdonly_", 32); if (type & MEM_ALLOC) - strncpy(prefix, "alloc_", 16); + strncpy(prefix, "alloc_", 32); + if (type & MEM_USER) + strncpy(prefix, "user_", 32); + if (type & MEM_PERCPU) + strncpy(prefix, "percpu_", 32); + if (type & PTR_UNTRUSTED) + strncpy(prefix, "untrusted_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -583,6 +585,7 @@ static char slot_type_char[] = { [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', }; static void print_liveness(struct bpf_verifier_env *env, @@ -598,6 +601,25 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } +static int get_spi(s32 off) +{ + return (-off - 1) / BPF_REG_SIZE; +} + +static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) +{ + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; +} + static struct bpf_func_state *func(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { @@ -649,6 +671,132 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) +{ + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + return BPF_DYNPTR_TYPE_LOCAL; + case DYNPTR_TYPE_RINGBUF: + return BPF_DYNPTR_TYPE_RINGBUF; + default: + return BPF_DYNPTR_TYPE_INVALID; + } +} + +static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +{ + return type == BPF_DYNPTR_TYPE_RINGBUF; +} + +static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type, int insn_idx) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type type; + int spi, i, id; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_DYNPTR; + state->stack[spi - 1].slot_type[i] = STACK_DYNPTR; + } + + type = arg_to_dynptr_type(arg_type); + if (type == BPF_DYNPTR_TYPE_INVALID) + return -EINVAL; + + state->stack[spi].spilled_ptr.dynptr.first_slot = true; + state->stack[spi].spilled_ptr.dynptr.type = type; + state->stack[spi - 1].spilled_ptr.dynptr.type = type; + + if (dynptr_type_refcounted(type)) { + /* The id is used to track proper releasing */ + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + state->stack[spi].spilled_ptr.id = id; + state->stack[spi - 1].spilled_ptr.id = id; + } + + return 0; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; + } + + /* Invalidate any slices associated with this dynptr */ + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + release_reference(env, state->stack[spi].spilled_ptr.id); + state->stack[spi].spilled_ptr.id = 0; + state->stack[spi - 1].spilled_ptr.id = 0; + } + + state->stack[spi].spilled_ptr.dynptr.first_slot = false; + state->stack[spi].spilled_ptr.dynptr.type = 0; + state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return true; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) + return false; + } + + return true; +} + +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.dynptr.first_slot) + return false; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] != STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) + return false; + } + + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ + if (arg_type == ARG_PTR_TO_DYNPTR) + return true; + + return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -682,74 +830,79 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str(env, t)); + verbose(env, "="); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ + verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (base_type(t) == PTR_TO_BTF_ID || - base_type(t) == PTR_TO_PERCPU_BTF_ID) + const char *sep = ""; + + verbose(env, "%s", reg_type_str(env, t)); + if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); - verbose(env, "(id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t)) - verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); + verbose(env, "("); +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) + + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) - verbose(env, ",off=%d", reg->off); + verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(env, ",r=%d", reg->range); + verbose_a("r=%d", reg->range); else if (base_type(t) == CONST_PTR_TO_MAP || base_type(t) == PTR_TO_MAP_KEY || base_type(t) == PTR_TO_MAP_VALUE) - verbose(env, ",ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { /* Typically an immediate SCALAR_VALUE, but * could be a pointer whose offset is too big * for reg->off */ - verbose(env, ",imm=%llx", reg->var_off.value); + verbose_a("imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(env, ",smin_value=%lld", - (long long)reg->smin_value); + verbose_a("smin=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(env, ",smax_value=%lld", - (long long)reg->smax_value); + verbose_a("smax=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(env, ",umin_value=%llu", - (unsigned long long)reg->umin_value); + verbose_a("umin=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(env, ",umax_value=%llu", - (unsigned long long)reg->umax_value); + verbose_a("umax=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, ",var_off=%s", tn_buf); + verbose_a("var_off=%s", tn_buf); } if (reg->s32_min_value != reg->smin_value && reg->s32_min_value != S32_MIN) - verbose(env, ",s32_min_value=%d", - (int)(reg->s32_min_value)); + verbose_a("s32_min=%d", (int)(reg->s32_min_value)); if (reg->s32_max_value != reg->smax_value && reg->s32_max_value != S32_MAX) - verbose(env, ",s32_max_value=%d", - (int)(reg->s32_max_value)); + verbose_a("s32_max=%d", (int)(reg->s32_max_value)); if (reg->u32_min_value != reg->umin_value && reg->u32_min_value != U32_MIN) - verbose(env, ",u32_min_value=%d", - (int)(reg->u32_min_value)); + verbose_a("u32_min=%d", (int)(reg->u32_min_value)); if (reg->u32_max_value != reg->umax_value && reg->u32_max_value != U32_MAX) - verbose(env, ",u32_max_value=%d", - (int)(reg->u32_max_value)); + verbose_a("u32_max=%d", (int)(reg->u32_max_value)); } +#undef verbose_a + verbose(env, ")"); } } @@ -774,7 +927,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str(env, t)); + verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1546,14 +1699,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id) + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); return; } mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID; + regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -1743,7 +1897,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) } static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, - s16 offset, struct module **btf_modp) + s16 offset) { struct bpf_kfunc_btf kf_btf = { .offset = offset }; struct bpf_kfunc_btf_tab *tab; @@ -1797,8 +1951,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); } - if (btf_modp) - *btf_modp = b->module; return b->btf; } @@ -1814,9 +1966,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) kfree(tab); } -static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset, - struct module **btf_modp) +static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { if (offset) { if (offset < 0) { @@ -1827,7 +1977,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, return ERR_PTR(-EINVAL); } - return __find_kfunc_desc_btf(env, offset, btf_modp); + return __find_kfunc_desc_btf(env, offset); } return btf_vmlinux ?: ERR_PTR(-ENOENT); } @@ -1841,6 +1991,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; + unsigned long call_imm; unsigned long addr; int err; @@ -1890,7 +2041,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL); + desc_btf = find_kfunc_desc_btf(env, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -1925,9 +2076,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + call_imm = BPF_CALL_IMM(addr); + /* Check whether or not the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel function %s is out of range\n", + func_name); + return -EINVAL; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = BPF_CALL_IMM(addr); + desc->imm = call_imm; desc->offset = offset; err = btf_distill_func_proto(&env->log, desc_btf, func_proto, func_name, @@ -2351,7 +2510,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL); + desc_btf = find_kfunc_desc_btf(data, insn->off); if (IS_ERR(desc_btf)) return "<error>"; @@ -2767,7 +2926,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BUF: - case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: @@ -3197,7 +3355,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return 0; } -enum stack_access_src { +enum bpf_access_src { ACCESS_DIRECT = 1, /* the access is performed by an instruction */ ACCESS_HELPER = 2, /* the access is performed by a helper */ }; @@ -3205,7 +3363,7 @@ enum stack_access_src { static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, + enum bpf_access_src type, struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) @@ -3455,9 +3613,175 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, return 0; } +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) +{ + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. + */ + + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); + return -EACCES; + } + + return 0; +} + +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + +static int map_kptr_match_type(struct bpf_verifier_env *env, + struct bpf_map_value_off_desc *off_desc, + struct bpf_reg_state *reg, u32 regno) +{ + const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + int perm_flags = PTR_MAYBE_NULL; + const char *reg_name = ""; + + /* Only unreferenced case accepts untrusted pointers */ + if (off_desc->type == BPF_KPTR_UNREF) + perm_flags |= PTR_UNTRUSTED; + + if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) + goto bad_type; + + if (!btf_is_kernel(reg->btf)) { + verbose(env, "R%d must point to kernel BTF\n", regno); + return -EINVAL; + } + /* We need to verify reg->type and reg->btf, before accessing reg->btf */ + reg_name = kernel_type_name(reg->btf, reg->btf_id); + + /* For ref_ptr case, release function check should ensure we get one + * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the + * normal store of unreferenced kptr, we must ensure var_off is zero. + * Since ref_ptr cannot be accessed directly by BPF insns, checks for + * reg->off and reg->ref_obj_id are not needed here. + */ + if (__check_ptr_off_reg(env, reg, regno, true)) + return -EACCES; + + /* A full type match is needed, as BTF can be vmlinux or module BTF, and + * we also need to take into account the reg->off. + * + * We want to support cases like: + * + * struct foo { + * struct bar br; + * struct baz bz; + * }; + * + * struct foo *v; + * v = func(); // PTR_TO_BTF_ID + * val->foo = v; // reg->off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * // first member type of struct after comparison fails + * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * // to match type + * + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * is zero. We must also ensure that btf_struct_ids_match does not walk + * the struct to match type against first member of struct, i.e. reject + * second case from above. Hence, when type is BPF_KPTR_REF, we set + * strict mode to true for type match. + */ + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + off_desc->kptr.btf, off_desc->kptr.btf_id, + off_desc->type == BPF_KPTR_REF)) + goto bad_type; + return 0; +bad_type: + verbose(env, "invalid kptr access, R%d type=%s%s ", regno, + reg_type_str(env, reg->type), reg_name); + verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); + if (off_desc->type == BPF_KPTR_UNREF) + verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), + targ_name); + else + verbose(env, "\n"); + return -EINVAL; +} + +static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, + int value_regno, int insn_idx, + struct bpf_map_value_off_desc *off_desc) +{ + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; + int class = BPF_CLASS(insn->code); + struct bpf_reg_state *val_reg; + + /* Things we already checked for in check_map_access and caller: + * - Reject cases where variable offset may touch kptr + * - size of access (must be BPF_DW) + * - tnum_is_const(reg->var_off) + * - off_desc->offset == off + reg->var_off.value + */ + /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n"); + return -EACCES; + } + + /* We only allow loading referenced kptr, since it will be marked as + * untrusted, similar to unreferenced kptr. + */ + if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + verbose(env, "store to referenced kptr disallowed\n"); + return -EACCES; + } + + if (class == BPF_LDX) { + val_reg = reg_state(env, value_regno); + /* We can simply mark the value_regno receiving the pointer + * value from map as PTR_TO_BTF_ID, with the correct type. + */ + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, + off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + /* For mark_ptr_or_null_reg */ + val_reg->id = ++env->id_gen; + } else if (class == BPF_STX) { + val_reg = reg_state(env, value_regno); + if (!register_is_null(val_reg) && + map_kptr_match_type(env, off_desc, val_reg, value_regno)) + return -EACCES; + } else if (class == BPF_ST) { + if (insn->imm) { + verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", + off_desc->offset); + return -EACCES; + } + } else { + verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n"); + return -EACCES; + } + return 0; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) + int off, int size, bool zero_size_allowed, + enum bpf_access_src src) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -3493,16 +3817,41 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_kptrs(map)) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) { + u32 p = tab->off[i].offset; + + if (reg->smin_value + off < p + sizeof(u64) && + p < reg->umax_value + off + size) { + if (src != ACCESS_DIRECT) { + verbose(env, "kptr cannot be accessed indirectly by helper\n"); + return -EACCES; + } + if (!tnum_is_const(reg->var_off)) { + verbose(env, "kptr access cannot have variable offset\n"); + return -EACCES; + } + if (p != off + reg->var_off.value) { + verbose(env, "kptr access misaligned expected=%u off=%llu\n", + p, off + reg->var_off.value); + return -EACCES; + } + if (size != bpf_size_to_bytes(BPF_DW)) { + verbose(env, "kptr access size must be BPF_DW\n"); + return -EACCES; + } + break; + } + } + } return err; } #define MAX_PACKET_OFF 0xffff -static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) -{ - return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; -} - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) @@ -3971,38 +4320,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - bool fixed_off_ok) -{ - /* Access to this pointer-typed register or passing it to a helper - * is only allowed in its original, unmodified form. - */ - - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); - return -EACCES; - } - - return 0; -} - -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) -{ - return __check_ptr_off_reg(env, reg, regno, false); -} - static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4047,9 +4364,9 @@ static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, int off, int size, bool zero_size_allowed, - const char *buf_info, u32 *max_access) { + const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; err = __check_buffer_access(env, buf_info, reg, regno, off, size); @@ -4159,6 +4476,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + enum bpf_type_flag flag = 0; u32 btf_id; int ret; @@ -4178,9 +4496,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_USER) { + verbose(env, + "R%d is ptr_%s access user memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (reg->type & MEM_PERCPU) { + verbose(env, + "R%d is ptr_%s access percpu memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id); + off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); @@ -4188,14 +4520,20 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id); + atype, &btf_id, &flag); } if (ret < 0) return ret; + /* If this is an untrusted pointer, all pointers formed by walking it + * also inherit the untrusted flag. + */ + if (type_flag(reg->type) & PTR_UNTRUSTED) + flag |= PTR_UNTRUSTED; + if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); return 0; } @@ -4208,6 +4546,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; u32 btf_id; @@ -4245,12 +4584,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); return 0; } @@ -4285,7 +4624,7 @@ static int check_stack_slot_within_bounds(int off, static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, - enum stack_access_src src, enum bpf_access_type type) + enum bpf_access_src src, enum bpf_access_type type) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -4381,6 +4720,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { + struct bpf_map_value_off_desc *kptr_off_desc = NULL; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4389,8 +4730,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access_type(env, regno, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) { + err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + if (err) + return err; + if (tnum_is_const(reg->var_off)) + kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, + off + reg->var_off.value); + if (kptr_off_desc) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, + kptr_off_desc); + } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; /* if map is read-only, track its contents as scalars */ @@ -4451,7 +4800,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, + &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -4535,7 +4885,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_BTF_ID) { + } else if (base_type(reg->type) == PTR_TO_BTF_ID && + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -4543,7 +4894,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); } else if (base_type(reg->type) == PTR_TO_BUF) { bool rdonly_mem = type_is_rdonly_mem(reg->type); - const char *buf_info; u32 *max_access; if (rdonly_mem) { @@ -4552,15 +4902,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str(env, reg->type)); return -EACCES; } - buf_info = "rdonly"; max_access = &env->prog->aux->max_rdonly_access; } else { - buf_info = "rdwr"; max_access = &env->prog->aux->max_rdwr_access; } err = check_buffer_access(env, reg, regno, off, size, false, - buf_info, max_access); + max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); @@ -4694,7 +5042,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i static int check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, struct bpf_call_arg_meta *meta) + enum bpf_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); @@ -4781,7 +5129,7 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) goto mark; if (is_spilled_reg(&state->stack[spi]) && @@ -4823,7 +5171,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - const char *buf_info; u32 *max_access; switch (base_type(reg->type)) { @@ -4832,6 +5179,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: @@ -4840,25 +5192,33 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, BPF_READ)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, - zero_size_allowed); + zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + } return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); return -EACCES; + } - buf_info = "rdonly"; max_access = &env->prog->aux->max_rdonly_access; } else { - buf_info = "rdwr"; max_access = &env->prog->aux->max_rdwr_access; } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - buf_info, max_access); + max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4877,27 +5237,119 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static int check_mem_size_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + int err; + + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. + */ + meta->msize_max_value = reg->umax_value; + + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. + */ + if (!tnum_is_const(reg->var_off)) + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->umin_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->umax_value, + zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); + return err; +} + int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { + bool may_be_null = type_may_be_null(reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + if (register_is_null(reg)) return 0; - if (type_may_be_null(reg->type)) { - /* Assuming that the register contains a value check if the memory - * access is safe. Temporarily save and restore the register's state as - * the conversion shouldn't be visible to a caller. - */ - const struct bpf_reg_state saved_reg = *reg; - int rv; - + memset(&meta, 0, sizeof(meta)); + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + if (may_be_null) { + saved_reg = *reg; mark_ptr_not_null_reg(reg); - rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + } + + err = check_helper_mem_access(env, regno, mem_size, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + + if (may_be_null) *reg = saved_reg; - return rv; + + return err; +} + +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) +{ + struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; + bool may_be_null = type_may_be_null(mem_reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + + WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + + memset(&meta, 0, sizeof(meta)); + + if (may_be_null) { + saved_reg = *mem_reg; + mark_ptr_not_null_reg(mem_reg); } - return check_helper_mem_access(env, regno, mem_size, true, NULL); + err = check_mem_size_reg(env, reg, regno, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + + if (may_be_null) + *mem_reg = saved_reg; + return err; } /* Implementation details: @@ -5029,10 +5481,51 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } -static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +static int process_kptr_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) { - return base_type(type) == ARG_PTR_TO_MEM || - base_type(type) == ARG_PTR_TO_UNINIT_MEM; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map_value_off_desc *off_desc; + struct bpf_map *map_ptr = reg->map_ptr; + u32 kptr_off; + int ret; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. kptr has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + if (!map_value_has_kptrs(map_ptr)) { + ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); + if (ret == -E2BIG) + verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, + BPF_MAP_VALUE_OFF_MAX); + else if (ret == -EEXIST) + verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); + else + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + return -EINVAL; + } + + meta->map_ptr = map_ptr; + kptr_off = reg->off + reg->var_off.value; + off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); + if (!off_desc) { + verbose(env, "off=%d doesn't point to kptr\n", kptr_off); + return -EACCES; + } + if (off_desc->type != BPF_KPTR_REF) { + verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); + return -EACCES; + } + meta->kptr_off_desc = off_desc; + return 0; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -5052,6 +5545,16 @@ static bool arg_type_is_int_ptr(enum bpf_arg_type type) type == ARG_PTR_TO_LONG; } +static bool arg_type_is_release(enum bpf_arg_type type) +{ + return type & OBJ_RELEASE; +} + +static bool arg_type_is_dynptr(enum bpf_arg_type type) +{ + return base_type(type) == ARG_PTR_TO_DYNPTR; +} + static int int_ptr_type_to_size(enum bpf_arg_type type) { if (type == ARG_PTR_TO_INT) @@ -5159,16 +5662,16 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | ME static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, @@ -5182,7 +5685,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, @@ -5191,11 +5693,14 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, + [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, - const u32 *arg_btf_id) + const u32 *arg_btf_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected, type = reg->type; @@ -5240,6 +5745,13 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, found: if (reg->type == PTR_TO_BTF_ID) { + /* For bpf_sk_release, it needs to match against first member + * 'struct sock_common', hence make an exception for it. This + * allows bpf_sk_release to work for multiple socket types. + */ + bool strict_type_match = arg_type_is_release(arg_type) && + meta->func_id != BPF_FUNC_sk_release; + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -5248,8 +5760,12 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id)) { + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + return -EACCES; + } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id, + strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, kernel_type_name(reg->btf, reg->btf_id), kernel_type_name(btf_vmlinux, *arg_btf_id)); @@ -5260,6 +5776,70 @@ found: return 0; } +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type) +{ + enum bpf_reg_type type = reg->type; + bool fixed_off_ok = false; + + switch ((u32)type) { + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_STACK: + if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset\n"); + return -EINVAL; + } + fallthrough; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case SCALAR_VALUE: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) + return 0; + break; + /* All the rest must be rejected, except PTR_TO_BTF_ID which allows + * fixed offset. + */ + case PTR_TO_BTF_ID: + /* When referenced PTR_TO_BTF_ID is passed to release function, + * it's fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. + */ + if (arg_type_is_release(arg_type) && reg->off) { + verbose(env, "R%d must have zero offset when passed to release func\n", + regno); + return -EINVAL; + } + /* For arg is release pointer, fixed_off_ok must be false, but + * we already checked and rejected reg->off != 0 above, so set + * to true to allow fixed offset for all other cases. + */ + fixed_off_ok = true; + break; + default: + break; + } + return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); +} + +static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + return state->stack[spi].spilled_ptr.id; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -5292,8 +5872,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; @@ -5305,40 +5884,37 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); if (err) return err; - switch ((u32)type) { - case SCALAR_VALUE: - /* Pointer types where reg offset is explicitly allowed: */ - case PTR_TO_PACKET: - case PTR_TO_PACKET_META: - case PTR_TO_MAP_KEY: - case PTR_TO_MAP_VALUE: - case PTR_TO_MEM: - case PTR_TO_MEM | MEM_RDONLY: - case PTR_TO_MEM | MEM_ALLOC: - case PTR_TO_BUF: - case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (arg_type == ARG_PTR_TO_ALLOC_MEM) - goto force_off_check; - break; - /* All the rest must be rejected: */ - default: -force_off_check: - err = __check_ptr_off_reg(env, reg, regno, - type == PTR_TO_BTF_ID); - if (err < 0) - return err; - break; - } + err = check_func_arg_reg_off(env, reg, regno, arg_type); + if (err) + return err; skip_type_check: + if (arg_type_is_release(arg_type)) { + if (arg_type_is_dynptr(arg_type)) { + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else if (!reg->ref_obj_id && !register_is_null(reg)) { + verbose(env, "R%d must be referenced when passed to release function\n", + regno); + return -EINVAL; + } + if (meta->release_regno) { + verbose(env, "verifier internal error: more than one release argument\n"); + return -EFAULT; + } + meta->release_regno = regno; + } + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5391,8 +5967,7 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { if (type_may_be_null(arg_type) && register_is_null(reg)) return 0; @@ -5404,7 +5979,7 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); + meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); @@ -5431,59 +6006,49 @@ skip_type_check: return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; - } else if (arg_type_is_mem_ptr(arg_type)) { + } else if (base_type(arg_type) == ARG_PTR_TO_MEM) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. */ - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); + meta->raw_mode = arg_type & MEM_UNINIT; } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* This is used to refine r0 return value bounds for helpers - * that enforce this value as an upper bound on return values. - * See do_refine_retval_range() for helpers that can refine - * the return value. C type of helper is u32 so we pull register - * bound from umax_value however, if negative verifier errors - * out. Only upper bounds can be learned because retval is an - * int type and negative retvals are allowed. - */ - meta->msize_max_value = reg->umax_value; + err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); + } else if (arg_type_is_dynptr(arg_type)) { + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. - */ - if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. */ - meta = NULL; + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } + meta->uninit_dynptr_regno = regno; + } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { + const char *err_extra = ""; - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local "; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf "; + break; + default: + break; + } - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; + verbose(env, "Expected an initialized %sdynptr as arg #%d\n", + err_extra, arg + 1); + return -EINVAL; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", @@ -5520,7 +6085,8 @@ skip_type_check: } err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false); + map->value_size - reg->off, false, + ACCESS_HELPER); if (err) return err; @@ -5536,6 +6102,9 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } + } else if (arg_type == ARG_PTR_TO_KPTR) { + if (process_kptr_func(env, regno, meta)) + return -EACCES; } return err; @@ -5601,7 +6170,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_query) + func_id != BPF_FUNC_ringbuf_query && + func_id != BPF_FUNC_ringbuf_reserve_dynptr && + func_id != BPF_FUNC_ringbuf_submit_dynptr && + func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -5717,6 +6289,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: + case BPF_FUNC_ringbuf_reserve_dynptr: + case BPF_FUNC_ringbuf_submit_dynptr: + case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; @@ -5771,6 +6346,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; + case BPF_FUNC_map_lookup_percpu_elem: + if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) @@ -5822,10 +6403,8 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, enum bpf_arg_type arg_next) { - return (arg_type_is_mem_ptr(arg_curr) && - !arg_type_is_mem_size(arg_next)) || - (!arg_type_is_mem_ptr(arg_curr) && - arg_type_is_mem_size(arg_next)); + return (base_type(arg_curr) == ARG_PTR_TO_MEM) != + arg_type_is_mem_size(arg_next); } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -5836,7 +6415,7 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - arg_type_is_mem_ptr(fn->arg5_type) || + base_type(fn->arg5_type) == ARG_PTR_TO_MEM || check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || @@ -5878,17 +6457,18 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id, + struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && @@ -6383,7 +6963,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && - func_id != BPF_FUNC_redirect_map) + func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { @@ -6572,7 +7153,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn, func_id, &meta); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -6605,8 +7186,35 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (is_release_function(func_id)) { - err = release_reference(env, meta.ref_obj_id); + regs = cur_regs(env); + + if (meta.uninit_dynptr_regno) { + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], + fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], + insn_idx); + if (err) + return err; + } + + if (meta.release_regno) { + err = -EINVAL; + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + else if (meta.ref_obj_id) + err = release_reference(env, meta.ref_obj_id); + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ + else if (register_is_null(®s[meta.release_regno])) + err = 0; if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -6614,8 +7222,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } - regs = cur_regs(env); - switch (func_id) { case BPF_FUNC_tail_call: err = check_reference_leak(env); @@ -6652,6 +7258,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; + case BPF_FUNC_dynptr_from_mem: + if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { + verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n", + reg_type_str(env, regs[BPF_REG_1].type)); + return -EACCES; + } } if (err) @@ -6739,21 +7351,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; - ret_btf_id = *fn->ret_btf_id; + if (func_id == BPF_FUNC_kptr_xchg) { + ret_btf = meta.kptr_off_desc->kptr.btf; + ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + } else { + ret_btf = btf_vmlinux; + ret_btf_id = *fn->ret_btf_id; + } if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - /* current BPF helper definitions are only coming from - * built-in code with type IDs from vmlinux BTF - */ - regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %u of func %s#%d\n", @@ -6776,6 +7392,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; + } else if (func_id == BPF_FUNC_dynptr_data) { + int dynptr_id = 0, i; + + /* Find the id of the dynptr we're acquiring a reference to */ + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (dynptr_id) { + verbose(env, "verifier internal error: multiple dynptr args in func\n"); + return -EFAULT; + } + dynptr_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + } + } + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = dynptr_id; } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); @@ -6842,22 +7473,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; - struct module *btf_mod = NULL; + int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; - int err; + bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod); + desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -6866,23 +7498,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!env->ops->check_kfunc_call || - !env->ops->check_kfunc_call(func_id, btf_mod)) { + if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_CHECK, func_id)) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; } + acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_ACQUIRE, func_id); + /* Check the arguments */ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); - if (err) + if (err < 0) return err; + /* In case of release function, we get register number of refcounted + * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + */ + if (err) { + err = release_reference(env, regs[err].ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + + if (acq && !btf_type_is_ptr(t)) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } + if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); @@ -6901,7 +7553,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RET_NULL, func_id)) { + regs[BPF_REG_0].type |= PTR_MAYBE_NULL; + /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ + regs[BPF_REG_0].id = ++env->id_gen; + } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + if (acq) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + regs[BPF_REG_0].id = id; + regs[BPF_REG_0].ref_obj_id = id; + } } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -7305,7 +7971,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false)) { + if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -9548,7 +10214,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: - case PTR_TO_PERCPU_BTF_ID: dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; @@ -10273,8 +10938,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env) aux->func_info[i].insn_off = env->subprog_info[i].start; } -#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ - sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) #define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE static int check_btf_line(struct bpf_verifier_env *env, @@ -11549,7 +12213,7 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) - err = check_kfunc_call(env, insn); + err = check_kfunc_call(env, insn, &env->insn_idx); else err = check_helper_call(env, insn, &env->insn_idx); if (err) @@ -11748,7 +12412,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { - aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { @@ -12696,7 +13360,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (!ctx_access) continue; - switch (env->insn_aux_data[i + delta].ptr_type) { + switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) continue; @@ -12713,6 +13377,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); @@ -12897,6 +13562,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; @@ -12992,6 +13658,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; + prog->jited_len = func[0]->jited_len; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); @@ -13019,6 +13686,7 @@ out_free: out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; + prog->blinding_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; @@ -13112,7 +13780,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; enum bpf_attach_type eatype = prog->expected_attach_type; - bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; @@ -13276,7 +13943,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !expect_blinding && + if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -13364,6 +14031,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + if (insn->imm == BPF_FUNC_task_storage_get || + insn->imm == BPF_FUNC_sk_storage_get || + insn->imm == BPF_FUNC_inode_storage_get) { + if (env->prog->aux->sleepable) + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. @@ -13376,7 +14063,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem)) { + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -13425,6 +14113,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { @@ -13452,6 +14142,9 @@ patch_map_ops_generic: case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); continue; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + continue; } goto patch_call_imm; diff --git a/kernel/capability.c b/kernel/capability.c index 46a361dde042..765194f5d678 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } +EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, diff --git a/kernel/cfi.c b/kernel/cfi.c index 9594cfd1cf2c..08102d19ec15 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -281,6 +281,8 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr) static inline cfi_check_fn find_check_fn(unsigned long ptr) { cfi_check_fn fn = NULL; + unsigned long flags; + bool rcu_idle; if (is_kernel_text(ptr)) return __cfi_check; @@ -290,13 +292,21 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) * the shadow and __module_address use RCU, so we need to wake it * up if necessary. */ - RCU_NONIDLE({ - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); + rcu_idle = !rcu_is_watching(); + if (rcu_idle) { + local_irq_save(flags); + rcu_irq_enter(); + } + + if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) + fn = find_shadow_check_fn(ptr); + if (!fn) + fn = find_module_check_fn(ptr); - if (!fn) - fn = find_module_check_fn(ptr); - }); + if (rcu_idle) { + rcu_irq_exit(); + local_irq_restore(flags); + } return fn; } diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 6e36e854b512..5da09c74228d 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -12,7 +12,6 @@ #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -extern bool cgroup_debug; extern void __init enable_debug_cgroup(void); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a557eea7166f..1779ccddb734 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -bool cgroup_debug __read_mostly; +static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without @@ -1302,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *root_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } @@ -2025,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = PTR_ERR(root->kf_root); goto exit_root_id; } - root_cgrp->kn = root->kf_root->kn; + root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); @@ -5685,7 +5685,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) + if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); @@ -6243,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). @@ -6305,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child, /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process + * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5de18448016c..71a418858a5e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory - * node binding, add this key to provide a quick low-cost judgement + * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); @@ -833,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } @@ -863,7 +863,7 @@ static int generate_sched_domains(cpumask_var_t **domains, if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_FLAG_DOMAIN)))) + housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (root_load_balance && @@ -952,7 +952,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -1181,7 +1181,7 @@ enum subparts_cmd { * effective_cpus. The function will return 0 if all the CPUs listed in * cpus_allowed can be granted or an error code will be returned. * - * For partcmd_disable, the cpuset is being transofrmed from a partition + * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back * into parent's effective_cpus. 0 should always be returned. @@ -2027,7 +2027,7 @@ out: } /* - * update_prstate - update partititon_root_state + * update_prstate - update partition_root_state * cs: the cpuset to update * new_prs: new partition root state * @@ -2879,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for - * histrical reasons - the flag may be specified during mount. + * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to @@ -3076,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migratecd to an ancestor. + * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) update_tasks_cpumask(cs); @@ -3390,8 +3390,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + /* + * cpus_allowd/mems_allowed set to v2 values in the initial + * cpuset_bind() call will be reset to v1 values in another + * cpuset_bind() call when v1 cpuset is mounted. + */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3984dd6b8ddb..617861a54793 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9d331ba44870..24b5c2ab5598 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -153,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @@ -166,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || @@ -315,7 +324,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_base_stat cur, delta; + struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ @@ -325,11 +334,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cur.cputime = rstatc->bstat.cputime; + delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ - delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index eb0029c9a6a6..e400fbbc8aba 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,5 +1,5 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set +# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config new file mode 100644 index 000000000000..dcd86f32f4ed --- /dev/null +++ b/kernel/configs/x86_debug.config @@ -0,0 +1,18 @@ +CONFIG_X86_DEBUG_FPU=y +CONFIG_LOCK_STAT=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y +CONFIG_KMEMCHECK=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_GCOV_KERNEL=y +CONFIG_LOCKDEP=y +CONFIG_PROVE_LOCKING=y +CONFIG_SCHEDSTATS=y +CONFIG_VMLINUX_VALIDATION=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 407a2568f35e..bbad5e375d3b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,6 +34,8 @@ #include <linux/scs.h> #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> +#include <linux/random.h> +#include <linux/cc_platform.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -70,7 +72,6 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; - int cpu; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -474,7 +475,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state -cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; bool bringup = st->state < target; @@ -485,14 +486,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) st->target = target; st->single = false; st->bringup = bringup; - if (cpu_dying(st->cpu) != !bringup) - set_cpu_dying(st->cpu, !bringup); + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); return prev_state; } static inline void -cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state prev_state) { bool bringup = !st->bringup; @@ -519,8 +521,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) } st->bringup = bringup; - if (cpu_dying(st->cpu) != !bringup) - set_cpu_dying(st->cpu, !bringup); + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ @@ -540,15 +542,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) wait_for_ap_thread(st, st->bringup); } -static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) { enum cpuhp_state prev_state; int ret; - prev_state = cpuhp_set_state(st, target); + prev_state = cpuhp_set_state(cpu, st, target); __cpuhp_kick_ap(st); if ((ret = st->result)) { - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } @@ -580,7 +583,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; - return cpuhp_kick_ap(st, st->target); + return cpuhp_kick_ap(cpu, st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -703,7 +706,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret, cpu, cpuhp_get_step(st->state)->name, st->state); - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, prev_state)); @@ -714,15 +717,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, /* * The cpu hotplug threads manage the bringup and teardown of the cpus */ -static void cpuhp_create(unsigned int cpu) -{ - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - - init_completion(&st->done_up); - init_completion(&st->done_down); - st->cpu = cpu; -} - static int cpuhp_should_run(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); @@ -874,7 +868,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu) cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); - ret = cpuhp_kick_ap(st, st->target); + ret = cpuhp_kick_ap(cpu, st, st->target); trace_cpuhp_exit(cpu, st->state, prev_state, ret); return ret; @@ -882,15 +876,27 @@ static int cpuhp_kick_ap_work(unsigned int cpu) static struct smp_hotplug_thread cpuhp_threads = { .store = &cpuhp_state.thread, - .create = &cpuhp_create, .thread_should_run = cpuhp_should_run, .thread_fn = cpuhp_thread_fun, .thread_comm = "cpuhp/%u", .selfparking = true, }; +static __init void cpuhp_init_state(void) +{ + struct cpuhp_cpu_state *st; + int cpu; + + for_each_possible_cpu(cpu) { + st = per_cpu_ptr(&cpuhp_state, cpu); + init_completion(&st->done_up); + init_completion(&st->done_down); + } +} + void __init cpuhp_threads_init(void) { + cpuhp_init_state(); BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); kthread_unpark(this_cpu_read(cpuhp_state.thread)); } @@ -1106,7 +1112,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret, cpu, cpuhp_get_step(st->state)->name, st->state); - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); if (st->state < prev_state) WARN_ON(cpuhp_invoke_callback_range(true, cpu, st, @@ -1133,7 +1139,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = cpuhp_set_state(st, target); + prev_state = cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. @@ -1164,7 +1170,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state < prev_state) { if (st->state == CPUHP_TEARDOWN_CPU) { - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } else { WARN(1, "DEAD callback error for CPU%d", cpu); @@ -1185,6 +1191,12 @@ out: static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { + /* + * If the platform does not support hotplug, report it explicitly to + * differentiate it from a transient offlining failure. + */ + if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED)) + return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; return _cpu_down(cpu, 0, target); @@ -1351,7 +1363,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - cpuhp_set_state(st, target); + cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1488,8 +1500,8 @@ int freeze_secondary_cpus(int primary) cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); - if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) - primary = housekeeping_any_cpu(HK_FLAG_TIMER); + if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) + primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); @@ -1659,6 +1671,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_RANDOM_PREPARE] = { + .name = "random:prepare", + .startup.single = random_prepare_cpu, + .teardown.single = NULL, + }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, @@ -1782,6 +1799,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, + [CPUHP_AP_RANDOM_ONLINE] = { + .name = "random:online", + .startup.single = random_online_cpu, + .teardown.single = NULL, + }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 256cf6db573c..71122e01623c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -222,9 +222,6 @@ next: p = strstr(p+1, name); } - if (!ck_cmdline) - return NULL; - return ck_cmdline; } @@ -243,9 +240,8 @@ static int __init __parse_crashkernel(char *cmdline, *crash_base = 0; ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -EINVAL; + return -ENOENT; ck_cmdline += strlen(name); diff --git a/kernel/cred.c b/kernel/cred.c index 933155c96922..e10c15f51c1f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -870,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, /* * report use of invalid credentials */ -void __invalid_creds(const struct cred *cred, const char *file, unsigned line) +void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) { printk(KERN_ERR "CRED: Invalid credentials\n"); printk(KERN_ERR "CRED: At %s:%u\n", file, line); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index da06a5553835..7beceb447211 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -53,6 +53,7 @@ #include <linux/vmacache.h> #include <linux/rcupdate.h> #include <linux/irq.h> +#include <linux/security.h> #include <asm/cacheflush.h> #include <asm/byteorder.h> @@ -752,6 +753,29 @@ cpu_master_loop: continue; kgdb_connected = 0; } else { + /* + * This is a brutal way to interfere with the debugger + * and prevent gdb being used to poke at kernel memory. + * This could cause trouble if lockdown is applied when + * there is already an active gdb session. For now the + * answer is simply "don't do that". Typically lockdown + * *will* be applied before the debug core gets started + * so only developers using kgdb for fairly advanced + * early kernel debug can be biten by this. Hopefully + * they are sophisticated enough to take care of + * themselves, especially with help from the lockdown + * message printed on the console! + */ + if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) { + if (IS_ENABLED(CONFIG_KGDB_KDB)) { + /* Switch back to kdb if possible... */ + dbg_kdb_mode = 1; + continue; + } else { + /* ... otherwise just bail */ + break; + } + } error = gdb_serial_stub(ks); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6735ac36b718..67d3c48a1522 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -9,7 +9,6 @@ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. */ -#include <linux/module.h> #include <linux/types.h> #include <linux/ctype.h> #include <linux/kernel.h> diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index f877a0a0d7cf..f87c750d3eb3 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -11,7 +11,6 @@ #include <linux/kdb.h> #include <linux/keyboard.h> #include <linux/ctype.h> -#include <linux/module.h> #include <linux/io.h> /* Keyboard Controller Registers on normal PCs. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0852a537dad4..438b868cbfa9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -26,7 +26,6 @@ #include <linux/utsname.h> #include <linux/vmalloc.h> #include <linux/atomic.h> -#include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> @@ -45,6 +44,7 @@ #include <linux/proc_fs.h> #include <linux/uaccess.h> #include <linux/slab.h> +#include <linux/security.h> #include "kdb_private.h" #undef MODULE_PARAM_PREFIX @@ -166,10 +166,62 @@ struct task_struct *kdb_curr_task(int cpu) } /* - * Check whether the flags of the current command and the permissions - * of the kdb console has allow a command to be run. + * Update the permissions flags (kdb_cmd_enabled) to match the + * current lockdown state. + * + * Within this function the calls to security_locked_down() are "lazy". We + * avoid calling them if the current value of kdb_cmd_enabled already excludes + * flags that might be subject to lockdown. Additionally we deliberately check + * the lockdown flags independently (even though read lockdown implies write + * lockdown) since that results in both simpler code and clearer messages to + * the user on first-time debugger entry. + * + * The permission masks during a read+write lockdown permits the following + * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE). + * + * The INSPECT commands are not blocked during lockdown because they are + * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes + * forcing them to have no arguments) and lsmod. These commands do expose + * some kernel state but do not allow the developer seated at the console to + * choose what state is reported. SIGNAL and REBOOT should not be controversial, + * given these are allowed for root during lockdown already. + */ +static void kdb_check_for_lockdown(void) +{ + const int write_flags = KDB_ENABLE_MEM_WRITE | + KDB_ENABLE_REG_WRITE | + KDB_ENABLE_FLOW_CTRL; + const int read_flags = KDB_ENABLE_MEM_READ | + KDB_ENABLE_REG_READ; + + bool need_to_lockdown_write = false; + bool need_to_lockdown_read = false; + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags)) + need_to_lockdown_write = + security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL); + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags)) + need_to_lockdown_read = + security_locked_down(LOCKDOWN_DBG_READ_KERNEL); + + /* De-compose KDB_ENABLE_ALL if required */ + if (need_to_lockdown_write || need_to_lockdown_read) + if (kdb_cmd_enabled & KDB_ENABLE_ALL) + kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL; + + if (need_to_lockdown_write) + kdb_cmd_enabled &= ~write_flags; + + if (need_to_lockdown_read) + kdb_cmd_enabled &= ~read_flags; +} + +/* + * Check whether the flags of the current command, the permissions of the kdb + * console and the lockdown state allow a command to be run. */ -static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, +static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, bool no_args) { /* permissions comes from userspace so needs massaging slightly */ @@ -1180,6 +1232,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_curr_task(raw_smp_processor_id()); KDB_DEBUG_STATE("kdb_local 1", reason); + + kdb_check_for_lockdown(); + kdb_go_count = 0; if (reason == KDB_REASON_DEBUG) { /* special case below */ @@ -2004,54 +2059,6 @@ static int kdb_ef(int argc, const char **argv) return 0; } -#if defined(CONFIG_MODULES) -/* - * kdb_lsmod - This function implements the 'lsmod' command. Lists - * currently loaded kernel modules. - * Mostly taken from userland lsmod. - */ -static int kdb_lsmod(int argc, const char **argv) -{ - struct module *mod; - - if (argc != 0) - return KDB_ARGCOUNT; - - kdb_printf("Module Size modstruct Used by\n"); - list_for_each_entry(mod, kdb_modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - kdb_printf("%-20s%8u 0x%px ", mod->name, - mod->core_layout.size, (void *)mod); -#ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); -#endif - if (mod->state == MODULE_STATE_GOING) - kdb_printf(" (Unloading)"); - else if (mod->state == MODULE_STATE_COMING) - kdb_printf(" (Loading)"); - else - kdb_printf(" (Live)"); - kdb_printf(" 0x%px", mod->core_layout.base); - -#ifdef CONFIG_MODULE_UNLOAD - { - struct module_use *use; - kdb_printf(" [ "); - list_for_each_entry(use, &mod->source_list, - source_list) - kdb_printf("%s ", use->target->name); - kdb_printf("]\n"); - } -#endif - } - - return 0; -} - -#endif /* CONFIG_MODULES */ - /* * kdb_env - This function implements the 'env' command. Display the * current environment variables. diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 0d2f9feea0a4..1f8c519a5f81 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -226,10 +226,6 @@ extern void kdb_kbd_cleanup_state(void); #define kdb_kbd_cleanup_state() #endif /* ! CONFIG_KDB_KEYBOARD */ -#ifdef CONFIG_MODULES -extern struct list_head *kdb_modules; -#endif /* CONFIG_MODULES */ - extern char kdb_prompt_str[]; #define KDB_WORD_SIZE ((int)sizeof(unsigned long)) diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index df2bface866e..0a39497140bf 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -17,7 +17,6 @@ #include <linux/stddef.h> #include <linux/vmalloc.h> #include <linux/ptrace.h> -#include <linux/module.h> #include <linux/highmem.h> #include <linux/hardirq.h> #include <linux/delay.h> @@ -291,7 +290,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size) */ int kdb_putarea_size(unsigned long addr, void *res, size_t size) { - int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size); + int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { kdb_func_printf("Bad address 0x%lx\n", addr); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index c5e8cea9e05f..164ed9ef77a3 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,7 +44,7 @@ void delayacct_init(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, +static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int state = delayacct_on; @@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, set_delayacct(state); return err; } + +static struct ctl_table kern_delayacct_table[] = { + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_delayacct_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_delayacct_table); + return 0; +} +late_initcall(kernel_delayacct_sysctls_init); #endif void __delayacct_tsk_init(struct task_struct *tsk) @@ -157,11 +177,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; + tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; + d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; + d->wpcopy_count += tsk->delays->wpcopy_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -229,3 +252,16 @@ void __delayacct_compact_end(void) ¤t->delays->compact_delay, ¤t->delays->compact_count); } + +void __delayacct_wpcopy_start(void) +{ + current->delays->wpcopy_start = local_clock(); +} + +void __delayacct_wpcopy_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->wpcopy_start, + ¤t->delays->wpcopy_delay, + ¤t->delays->wpcopy_count); +} diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1b02179758cb..56866aaa2ae1 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL select DMA_DECLARE_COHERENT bool -config DMA_REMAP - bool - depends on MMU - select DMA_NONCOHERENT_MMAP - config DMA_DIRECT_REMAP bool - select DMA_REMAP select DMA_COHERENT_POOL + select DMA_NONCOHERENT_MMAP config DMA_CMA bool "DMA Contiguous Memory Allocator" diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 0dd65ec1d234..21926e46ef4f 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o -obj-$(CONFIG_DMA_REMAP) += remap.o +obj-$(CONFIG_MMU) += remap.o obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5c..6ea80ae42622 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = { static int __init rmem_cma_setup(struct reserved_mem *rmem) { - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; @@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if ((rmem->base & mask) || (rmem->size & mask)) { + if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 7a14ca29c377..2caafd13f8aa 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -448,7 +448,7 @@ void debug_dma_dump_mappings(struct device *dev) * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. */ -static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) @@ -564,7 +564,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { - pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); + pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { err_printk(entry->dev, entry, @@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str) global_disable = true; } - return 0; + return 1; } static __init int dma_debug_entries_cmdline(char *str) @@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str) return -EINVAL; if (!get_option(&str, &nr_prealloc_entries)) nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; - return 0; + return 1; } __setup("dma_debug=", dma_debug_cmdline); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 50f48e9e4598..8d0b68a17042 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -79,7 +79,7 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) { if (!force_dma_unencrypted(dev)) return 0; - return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size)); + return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size)); } static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) @@ -88,7 +88,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) if (!force_dma_unencrypted(dev)) return 0; - ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size)); if (ret) pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); return ret; @@ -115,7 +115,7 @@ static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp) + gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); struct page *page = NULL; @@ -129,9 +129,12 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, size); - page = NULL; + if (page) { + if (!dma_coherent_ok(dev, page_to_phys(page), size) || + (!allow_highmem && PageHighMem(page))) { + dma_free_contiguous(dev, page, size); + page = NULL; + } } again: if (!page) @@ -189,7 +192,7 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, { struct page *page; - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; @@ -262,31 +265,31 @@ void *dma_direct_alloc(struct device *dev, size_t size, return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; + + /* + * dma_alloc_contiguous can return highmem pages depending on a + * combination the cma= arguments and per-arch setup. These need to be + * remapped to return a kernel virtual address. + */ if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup, - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, so - * log an error and fail. - */ - if (!IS_ENABLED(CONFIG_DMA_REMAP)) { - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; - } remap = true; set_uncached = false; } if (remap) { + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); + + if (force_dma_unencrypted(dev)) + prot = pgprot_decrypted(prot); + /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), + ret = dma_common_contiguous_remap(page, size, prot, __builtin_return_address(0)); if (!ret) goto out_free_pages; @@ -349,12 +352,12 @@ void dma_direct_free(struct device *dev, size_t size, dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { + if (is_vmalloc_addr(cpu_addr)) { vunmap(cpu_addr); } else { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - if (dma_set_encrypted(dev, cpu_addr, 1 << page_order)) + if (dma_set_encrypted(dev, cpu_addr, size)) return; } @@ -370,19 +373,9 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); - page = __dma_direct_alloc_pages(dev, size, gfp); + page = __dma_direct_alloc_pages(dev, size, gfp, false); if (!page) return NULL; - if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, - * so log an error and fail. - */ - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; - } ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) @@ -399,7 +392,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir) { - unsigned int page_order = get_order(size); void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ @@ -407,7 +399,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - if (dma_set_encrypted(dev, vaddr, 1 << page_order)) + if (dma_set_encrypted(dev, vaddr, size)) return; __dma_direct_free_pages(dev, page, size); } @@ -539,6 +531,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + if (force_dma_unencrypted(dev)) + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 4632b0f4f72e..a78c0ba70645 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -91,7 +91,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, return swiotlb_map(dev, phys, size, dir, attrs); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) + if (is_swiotlb_active(dev)) return swiotlb_map(dev, phys, size, dir, attrs); dev_WARN_ONCE(dev, 1, @@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(dev, phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); } #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 9b9af1bd6be3..0520a8f4fb1d 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,6 +11,7 @@ #include <linux/dma-mapping.h> #include <linux/kernel.h> #include <linux/kthread.h> +#include <linux/map_benchmark.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/pci.h> @@ -18,30 +19,6 @@ #include <linux/slab.h> #include <linux/timekeeping.h> -#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) -#define DMA_MAP_MAX_THREADS 1024 -#define DMA_MAP_MAX_SECONDS 300 -#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) - -#define DMA_MAP_BIDIRECTIONAL 0 -#define DMA_MAP_TO_DEVICE 1 -#define DMA_MAP_FROM_DEVICE 2 - -struct map_benchmark { - __u64 avg_map_100ns; /* average map latency in 100ns */ - __u64 map_stddev; /* standard deviation of map latency */ - __u64 avg_unmap_100ns; /* as above */ - __u64 unmap_stddev; - __u32 threads; /* how many threads will do map/unmap in parallel */ - __u32 seconds; /* how long the test will last */ - __s32 node; /* which numa node this benchmark will run on */ - __u32 dma_bits; /* DMA addressing capability */ - __u32 dma_dir; /* DMA data direction */ - __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ - __u8 expansion[76]; /* For future use */ -}; - struct map_benchmark_data { struct map_benchmark bparam; struct device *dev; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9478eccd1c8e..db7244291b74 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -407,8 +407,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs); */ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { - if (force_dma_unencrypted(dev)) - prot = pgprot_decrypted(prot); if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE @@ -745,7 +743,6 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { /* @@ -761,7 +758,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) return 0; } EXPORT_SYMBOL(dma_set_coherent_mask); -#endif size_t dma_max_mapping_size(struct device *dev) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6db1c475ec82..cb50f8d38360 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,40 +21,33 @@ #define pr_fmt(fmt) "software IO TLB: " fmt #include <linux/cache.h> +#include <linux/cc_platform.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> -#include <linux/mm.h> #include <linux/export.h> +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/io.h> +#include <linux/iommu-helper.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/pfn.h> +#include <linux/scatterlist.h> +#include <linux/set_memory.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/swiotlb.h> -#include <linux/pfn.h> #include <linux/types.h> -#include <linux/ctype.h> -#include <linux/highmem.h> -#include <linux/gfp.h> -#include <linux/scatterlist.h> -#include <linux/cc_platform.h> -#include <linux/set_memory.h> -#ifdef CONFIG_DEBUG_FS -#include <linux/debugfs.h> -#endif #ifdef CONFIG_DMA_RESTRICTED_POOL -#include <linux/io.h> #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> #include <linux/slab.h> #endif -#include <asm/io.h> -#include <asm/dma.h> - -#include <linux/io.h> -#include <linux/init.h> -#include <linux/memblock.h> -#include <linux/iommu-helper.h> - #define CREATE_TRACE_POINTS #include <trace/events/swiotlb.h> @@ -69,18 +62,13 @@ #define INVALID_PHYS_ADDR (~(phys_addr_t)0) -enum swiotlb_force swiotlb_force; +static bool swiotlb_force_bounce; +static bool swiotlb_force_disable; struct io_tlb_mem io_tlb_default_mem; phys_addr_t swiotlb_unencrypted_base; -/* - * Max segment that we can provide which (if pages are contingous) will - * not be bounced (unless SWIOTLB_FORCE is set). - */ -static unsigned int max_segment; - static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static int __init @@ -94,9 +82,9 @@ setup_io_tlb_npages(char *str) if (*str == ',') ++str; if (!strcmp(str, "force")) - swiotlb_force = SWIOTLB_FORCE; + swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) - swiotlb_force = SWIOTLB_NO_FORCE; + swiotlb_force_disable = true; return 0; } @@ -104,18 +92,12 @@ early_param("swiotlb", setup_io_tlb_npages); unsigned int swiotlb_max_segment(void) { - return io_tlb_default_mem.nslabs ? max_segment : 0; + if (!io_tlb_default_mem.nslabs) + return 0; + return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE); } EXPORT_SYMBOL_GPL(swiotlb_max_segment); -void swiotlb_set_max_segment(unsigned int val) -{ - if (swiotlb_force == SWIOTLB_FORCE) - max_segment = 1; - else - max_segment = rounddown(val, PAGE_SIZE); -} - unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; @@ -207,12 +189,10 @@ void __init swiotlb_update_mem_attributes(void) mem->vaddr = swiotlb_mem_remap(mem, bytes); if (!mem->vaddr) mem->vaddr = vaddr; - - memset(mem->vaddr, 0, bytes); } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, - unsigned long nslabs, bool late_alloc) + unsigned long nslabs, unsigned int flags, bool late_alloc) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; @@ -223,8 +203,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->index = 0; mem->late_alloc = late_alloc; - if (swiotlb_force == SWIOTLB_FORCE) - mem->force_bounce = true; + mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); spin_lock_init(&mem->lock); for (i = 0; i < mem->nslabs; i++) { @@ -245,17 +224,49 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long nslabs = default_nslabs; size_t alloc_size; + size_t bytes; + void *tlb; - if (swiotlb_force == SWIOTLB_NO_FORCE) - return 0; + if (!addressing_limit && !swiotlb_force_bounce) + return; + if (swiotlb_force_disable) + return; - /* protect against double initialization */ - if (WARN_ON_ONCE(mem->nslabs)) - return -ENOMEM; + /* + * By default allocate the bounce buffer memory from low memory, but + * allow to pick a location everywhere for hypervisors with guest + * memory encryption. + */ +retry: + bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); + if (flags & SWIOTLB_ANY) + tlb = memblock_alloc(bytes, PAGE_SIZE); + else + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + if (!tlb) { + pr_warn("%s: failed to allocate tlb structure\n", __func__); + return; + } + + if (remap && remap(tlb, nslabs) < 0) { + memblock_free(tlb, PAGE_ALIGN(bytes)); + + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + if (nslabs < IO_TLB_MIN_SLABS) + panic("%s: Failed to remap %zu bytes\n", + __func__, bytes); + goto retry; + } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); @@ -263,39 +274,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false); - if (verbose) + if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); - return 0; } -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init(int verbose) +void __init swiotlb_init(bool addressing_limit, unsigned int flags) { - size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); - void *tlb; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* Get IO TLB memory from the low pages */ - tlb = memblock_alloc_low(bytes, PAGE_SIZE); - if (!tlb) - goto fail; - if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose)) - goto fail_free_mem; - return; - -fail_free_mem: - memblock_free(tlb, bytes); -fail: - pr_warn("Cannot allocate buffer"); + return swiotlb_init_remap(addressing_limit, flags, NULL); } /* @@ -303,72 +290,65 @@ fail: * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ -int -swiotlb_late_init_with_default_size(size_t default_size) +int swiotlb_init_late(size_t size, gfp_t gfp_mask, + int (*remap)(void *tlb, unsigned long nslabs)) { - unsigned long nslabs = - ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); - unsigned long bytes; + struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned char *vstart = NULL; unsigned int order; + bool retried = false; int rc = 0; - if (swiotlb_force == SWIOTLB_NO_FORCE) + if (swiotlb_force_disable) return 0; - /* - * Get IO TLB memory from the low pages - */ +retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; - bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; order--; + nslabs = SLABS_PER_PAGE << order; + retried = true; } if (!vstart) return -ENOMEM; - if (order != get_order(bytes)) { - pr_warn("only able to allocate %ld MB\n", - (PAGE_SIZE << order) >> 20); - nslabs = SLABS_PER_PAGE << order; - } - rc = swiotlb_late_init_with_tbl(vstart, nslabs); - if (rc) + if (remap) + rc = remap(vstart, nslabs); + if (rc) { free_pages((unsigned long)vstart, order); - return rc; -} - -int -swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) -{ - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long bytes = nslabs << IO_TLB_SHIFT; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - return 0; + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + if (nslabs < IO_TLB_MIN_SLABS) + return rc; + retried = true; + goto retry; + } - /* protect against double initialization */ - if (WARN_ON_ONCE(mem->nslabs)) - return -ENOMEM; + if (retried) { + pr_warn("only able to allocate %ld MB\n", + (PAGE_SIZE << order) >> 20); + } mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); - if (!mem->slots) + if (!mem->slots) { + free_pages((unsigned long)vstart, order); return -ENOMEM; + } - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); + set_memory_decrypted((unsigned long)vstart, + (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true); swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } @@ -378,6 +358,9 @@ void __init swiotlb_exit(void) unsigned long tbl_vaddr; size_t tbl_size, slots_size; + if (swiotlb_force_bounce) + return; + if (!mem->nslabs) return; @@ -701,13 +684,10 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - /* - * Unconditional bounce is necessary to avoid corruption on - * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite - * the whole lengt of the bounce buffer. - */ - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - BUG_ON(!valid_dma_direction(dir)); + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, @@ -729,8 +709,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, phys_addr_t swiotlb_addr; dma_addr_t dma_addr; - trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, - swiotlb_force); + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, attrs); @@ -755,7 +734,18 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; + int min_align_mask = dma_get_min_align_mask(dev); + int min_align = 0; + + /* + * swiotlb_find_slots() skips slots according to + * min align mask. This affects max mapping size. + * Take it into acount here. + */ + if (min_align_mask) + min_align = roundup(min_align_mask, IO_TLB_SIZE); + + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } bool is_swiotlb_active(struct device *dev) @@ -766,47 +756,29 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); -#ifdef CONFIG_DEBUG_FS -static struct dentry *debugfs_dir; - -static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) +static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, + const char *dirname) { + mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); + if (!mem->nslabs) + return; + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); } -static int __init swiotlb_create_default_debugfs(void) +static int __init __maybe_unused swiotlb_create_default_debugfs(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - - debugfs_dir = debugfs_create_dir("swiotlb", NULL); - if (mem->nslabs) { - mem->debugfs = debugfs_dir; - swiotlb_create_debugfs_files(mem); - } + swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } +#ifdef CONFIG_DEBUG_FS late_initcall(swiotlb_create_default_debugfs); - #endif #ifdef CONFIG_DMA_RESTRICTED_POOL -#ifdef CONFIG_DEBUG_FS -static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) -{ - struct io_tlb_mem *mem = rmem->priv; - - mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir); - swiotlb_create_debugfs_files(mem); -} -#else -static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) -{ -} -#endif - struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; @@ -853,8 +825,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, if (!mem) return -ENOMEM; - mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs), - GFP_KERNEL); + mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL); if (!mem->slots) { kfree(mem); return -ENOMEM; @@ -862,13 +833,13 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); - mem->force_bounce = true; + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE, + false); mem->for_alloc = true; rmem->priv = mem; - rmem_swiotlb_debugfs_init(rmem); + swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bad713684c2e..032f164abe7c 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,7 +2,9 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> +#include <linux/resume_user_mode.h> #include <linux/highmem.h> +#include <linux/jump_label.h> #include <linux/livepatch.h> #include <linux/audit.h> #include <linux/tick.h> @@ -15,7 +17,7 @@ /* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - arch_check_user_regs(regs); + arch_enter_from_user_mode(regs); lockdep_hardirqs_off(CALLER_ADDR0); CT_WARN_ON(ct_state() != CONTEXT_USER); @@ -58,7 +60,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = arch_syscall_enter_tracehook(regs); + ret = ptrace_report_syscall_entry(regs); if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -124,7 +126,7 @@ static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); user_enter_irqoff(); @@ -138,15 +140,7 @@ void noinstr exit_to_user_mode(void) } /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } - -static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) -{ - if (ti_work & _TIF_NOTIFY_SIGNAL) - tracehook_notify_signal(); - - arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); -} +void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) @@ -169,10 +163,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, klp_update_patch_state(current); if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - handle_signal_work(regs, ti_work); + arch_do_signal_or_restart(regs); if (ti_work & _TIF_NOTIFY_RESUME) - tracehook_notify_resume(regs); + resume_user_mode_work(regs); /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); @@ -252,7 +246,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - arch_syscall_exit_tracehook(regs, step); + ptrace_report_syscall_exit(regs, step); } /* @@ -380,7 +374,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } -void irqentry_exit_cond_resched(void) +void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { /* Sanity check RCU and thread stack */ @@ -392,7 +386,17 @@ void irqentry_exit_cond_resched(void) } } #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif #endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) @@ -412,7 +416,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); rcu_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); @@ -420,13 +424,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMPT_DYNAMIC - static_call(irqentry_exit_cond_resched)(); -#else + if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); -#endif - } + /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); @@ -465,7 +465,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) ftrace_nmi_exit(); if (irq_state.lockdep) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); } instrumentation_end(); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 96d476e06c77..2e0f75bcb7fd 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,10 +8,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; - if (ti_work & _TIF_NOTIFY_SIGNAL) - tracehook_notify_signal(); - - if (ti_work & _TIF_SIGPENDING) { + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { kvm_handle_signal_exit(vcpu); return -EINTR; } @@ -20,7 +17,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) - tracehook_notify_resume(NULL); + resume_user_mode_work(NULL); ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 58cbe357fb2b..1273be84392c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { - mm_segment_t fs; - if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6859229497b1..80782cddb1da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -781,7 +780,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; - struct perf_cgroup *cgrp; /* * ensure we access cgroup data only when needed and @@ -790,21 +788,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current, event->ctx); + info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { - info = this_cpu_ptr(event->cgrp->info); + if (info->active) __update_cgrp_time(info, perf_clock(), true); - } } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; @@ -813,10 +809,10 @@ perf_cgroup_set_timestamp(struct task_struct *task, * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ - if (!task || !ctx->nr_cgroups) + if (!cgrp) return; - cgrp = perf_cgroup_from_task(task, ctx); + WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); @@ -828,17 +824,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - /* * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next */ -static void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task) { + struct perf_cgroup *cgrp; struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -849,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) */ local_irq_save(flags); + cgrp = perf_cgroup_from_task(task, NULL); + list = this_cpu_ptr(&cgrp_cpuctx_list); list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + continue; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + cpu_ctx_sched_in(cpuctx, EVENT_ALL); - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, - &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -885,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(next, NULL); - - /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); - - rcu_read_unlock(); -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(prev, NULL); - - /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); - - rcu_read_unlock(); -} - static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { @@ -1032,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); - /* - * Since setting cpuctx->cgrp is conditional on the current @cgrp - * matching the event's cgroup, we must do this for every new event, - * because if the first would mismatch, the second would not try again - * and we would leave cpuctx->cgrp unset. - */ - if (ctx->is_active && !cpuctx->cgrp) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - cpuctx->cgrp = cgrp; - } - if (ctx->nr_cgroups++) return; + cpuctx->cgrp = perf_cgroup_from_task(current, ctx); list_add(&cpuctx->cgrp_cpuctx_entry, per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } @@ -1069,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (--ctx->nr_cgroups) return; - if (ctx->is_active && cpuctx->cgrp) - cpuctx->cgrp = NULL; - + cpuctx->cgrp = NULL; list_del(&cpuctx->cgrp_cpuctx_entry); } @@ -1100,16 +1021,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, { } -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ -} - static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) @@ -1118,13 +1029,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -static inline void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } @@ -1147,6 +1052,10 @@ static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } + +static void perf_cgroup_switch(struct task_struct *task) +{ +} #endif /* @@ -2713,8 +2622,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, @@ -2730,15 +2638,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - struct task_struct *task) + struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); } /* @@ -2788,7 +2695,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (ctx_event_type & EVENT_PINNED) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -3011,7 +2918,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3020,7 +2927,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3668,7 +3575,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_out(task, next); + perf_cgroup_switch(next); } /* @@ -3865,8 +3772,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { int is_active = ctx->is_active; @@ -3878,7 +3784,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (is_active ^ EVENT_TIME) { /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(task, ctx); + perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -3909,12 +3815,11 @@ ctx_sched_in(struct perf_event_context *ctx, } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); + ctx_sched_in(ctx, cpuctx, event_type); } static void perf_event_context_sched_in(struct perf_event_context *ctx, @@ -3956,7 +3861,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + perf_event_sched_in(cpuctx, ctx); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(cpuctx->task_ctx, true); @@ -3984,16 +3889,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; - /* - * If cgroup events exist on this CPU, then we need to check if we have - * to switch in PMU state; cgroup event are system-wide mode only. - * - * Since cgroup events are CPU events, we must schedule these in before - * we schedule in the task events. - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); - for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -4267,7 +4162,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4339,7 +4234,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4362,7 +4257,6 @@ static void perf_event_remove_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *event, *next; - LIST_HEAD(free_list); unsigned long flags; bool modified = false; @@ -6352,7 +6246,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } @@ -6533,8 +6427,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - force_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + send_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) @@ -6746,7 +6640,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; - mm_segment_t fs; /* * We dump: @@ -6764,9 +6657,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -10531,8 +10422,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region @@ -10574,8 +10463,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } @@ -11637,6 +11529,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (parent_event) + event->event_caps = parent_event->event_caps; + if (event->attr.sigtrap) atomic_set(&event->event_limit, 1); @@ -12321,6 +12216,9 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. + * + * Racy, not holding group_leader->ctx->mutex, see comment with + * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; @@ -12386,6 +12284,7 @@ SYSCALL_DEFINE5(perf_event_open, } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; + goto not_move_group; } } @@ -12402,7 +12301,17 @@ SYSCALL_DEFINE5(perf_event_open, } } else { mutex_lock(&ctx->mutex); + + /* + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, + * see the group_leader && !move_group test earlier. + */ + if (group_leader && group_leader->ctx != ctx) { + err = -EINVAL; + goto err_locked; + } } +not_move_group: if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; @@ -13564,7 +13473,7 @@ static int __perf_cgroup_move(void *info) { struct task_struct *task = info; rcu_read_lock(); - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + perf_cgroup_switch(task); rcu_read_unlock(); return 0; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 082832738c8f..5150d5f84c03 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 52868716ec35..fb35b926024c 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -859,11 +859,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..2eaa327f8158 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = compound_head(old_page), - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); int err; struct mmu_notifier_range range; @@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -184,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); + page_add_new_anon_rmap(new_page, vma, addr); lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ @@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; @@ -794,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, struct page *page; /* * Ensure that the page that has the original instruction is populated - * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->readpage) + if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -1150,7 +1143,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ - if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) + if (!inode->i_mapping->a_ops->read_folio && + !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..f072959fcab7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,7 +49,8 @@ #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> -#include <linux/tracehook.h> +#include <linux/blkdev.h> +#include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> @@ -64,6 +65,7 @@ #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -169,6 +171,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); @@ -735,21 +738,7 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - WARN_ON(blk_needs_flush_plug(tsk)); - - /* - * If do_dead is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - * - * On uptodate architectures force_uaccess_begin is a noop. On - * architectures that still have set_fs/get_fs in addition to handling - * oopses handles kernel threads that run as set_fs(KERNEL_DS) by - * default. - */ - force_uaccess_begin(); + WARN_ON(tsk->plug); kcov_task_exit(tsk); @@ -845,6 +834,7 @@ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); @@ -906,7 +896,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -void +void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/extable.c b/kernel/extable.c index b6f330f0fe74..bda5e9761541 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -3,6 +3,7 @@ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. */ +#include <linux/elf.h> #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/extable.h> @@ -132,12 +133,33 @@ out: } /* - * On some architectures (PPC64, IA64) function pointers + * On some architectures (PPC64, IA64, PARISC) function pointers * are actually only tokens to some data that then holds the * real function address. As a result, to find if a function * pointer is part of the kernel text, we need to do some * special dereferencing first. */ +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr) +{ + func_desc_t *desc = ptr; + void *p; + + if (!get_kernel_nofault(p, (void *)&desc->addr)) + ptr = p; + return ptr; +} +EXPORT_SYMBOL_GPL(dereference_function_descriptor); + +void *dereference_kernel_function_descriptor(void *ptr) +{ + if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) + return ptr; + + return dereference_function_descriptor(ptr); +} +#endif + int func_ptr_is_kernel_text(void *ptr) { unsigned long addr; diff --git a/kernel/fork.c b/kernel/fork.c index f1e89007f228..9d44f2d46c69 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> +#include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -185,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk) */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) -#ifdef CONFIG_VMAP_STACK +# ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @@ -193,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk) #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +struct vm_stack { + struct rcu_head rcu; + struct vm_struct *stack_vm_area; +}; + +static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +{ + unsigned int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) + continue; + return true; + } + return false; +} + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + + if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) + return; + + vfree(vm_stack); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct vm_stack *vm_stack = tsk->stack; + + vm_stack->stack_vm_area = tsk->stack_vm_area; + call_rcu(&vm_stack->rcu, thread_stack_free_rcu); +} + static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); @@ -210,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } -#endif -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int memcg_charge_kernel_stack(struct vm_struct *vm) { -#ifdef CONFIG_VMAP_STACK + int i; + int ret; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + if (ret) + goto err; + } + return 0; +err: + /* + * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is + * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will + * ignore this page. + */ + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); + return ret; +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + struct vm_struct *vm; void *stack; int i; @@ -226,15 +286,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Mark stack accessible for KASAN. */ + /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); + stack = kasan_reset_tag(s->addr); + /* Clear stale pointers from reused stack. */ - memset(s->addr, 0, THREAD_SIZE); + memset(stack, 0, THREAD_SIZE); + + if (memcg_charge_kernel_stack(s)) { + vfree(s->addr); + return -ENOMEM; + } tsk->stack_vm_area = s; - tsk->stack = s->addr; - return s->addr; + tsk->stack = stack; + return 0; } /* @@ -247,71 +314,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + if (!stack) + return -ENOMEM; + vm = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm)) { + vfree(stack); + return -ENOMEM; + } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - if (stack) { - tsk->stack_vm_area = find_vm_area(stack); - tsk->stack = stack; - } - return stack; -#else + tsk->stack_vm_area = vm; + stack = kasan_reset_tag(stack); + tsk->stack = stack; + return 0; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) + thread_stack_delayed_free(tsk); + + tsk->stack = NULL; + tsk->stack_vm_area = NULL; +} + +# else /* !CONFIG_VMAP_STACK */ + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; + + call_rcu(rh, thread_stack_free_rcu); +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); - return tsk->stack; + return 0; } - return NULL; -#endif + return -ENOMEM; } -static inline void free_thread_stack(struct task_struct *tsk) +static void free_thread_stack(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { - int i; + thread_stack_delayed_free(tsk); + tsk->stack = NULL; +} - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); +# endif /* CONFIG_VMAP_STACK */ +# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ - for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], - NULL, tsk->stack_vm_area) != NULL) - continue; +static struct kmem_cache *thread_stack_cache; - return; - } +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + kmem_cache_free(thread_stack_cache, rh); +} - vfree_atomic(tsk->stack); - return; - } -#endif +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + call_rcu(rh, thread_stack_free_rcu); } -# else -static struct kmem_cache *thread_stack_cache; -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; - return stack; + return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, tsk->stack); + thread_stack_delayed_free(tsk); + tsk->stack = NULL; } void thread_stack_cache_init(void) @@ -321,8 +413,26 @@ void thread_stack_cache_init(void) THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } -# endif -#endif + +# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + unsigned long *stack; + + stack = arch_alloc_thread_stack_node(tsk, node); + tsk->stack = stack; + return stack ? 0 : -ENOMEM; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + arch_free_thread_stack(tsk); + tsk->stack = NULL; +} + +#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -379,50 +489,34 @@ void vm_area_free(struct vm_area_struct *vma) static void account_kernel_stack(struct task_struct *tsk, int account) { - void *stack = task_stack_page(tsk); - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { + void *stack = task_stack_page(tsk); + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } -static int memcg_charge_kernel_stack(struct task_struct *tsk) +void exit_task_stack_account(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - int ret; - - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + account_kernel_stack(tsk, -1); - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm; int i; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - /* - * If memcg_kmem_charge_page() fails, page's - * memory cgroup pointer is NULL, and - * memcg_kmem_uncharge_page() in free_thread_stack() - * will ignore this page. - */ - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, - 0); - if (ret) - return ret; - } + vm = task_stack_vm_area(tsk); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); } -#endif - return 0; } static void release_task_stack(struct task_struct *tsk) @@ -430,12 +524,7 @@ static void release_task_stack(struct task_struct *tsk) if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ - account_kernel_stack(tsk, -1); free_thread_stack(tsk); - tsk->stack = NULL; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = NULL; -#endif } #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -523,9 +612,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = ksm_fork(mm, oldmm); if (retval) goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; + khugepaged_fork(mm, oldmm); prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -703,6 +790,7 @@ void __mmdrop(struct mm_struct *mm) mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); + mm_pasid_drop(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -874,8 +962,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - unsigned long *stack; - struct vm_struct *stack_vm_area __maybe_unused; int err; if (node == NUMA_NO_NODE) @@ -884,32 +970,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - stack = alloc_thread_stack_node(tsk, node); - if (!stack) + err = arch_dup_task_struct(tsk, orig); + if (err) goto free_tsk; - if (memcg_charge_kernel_stack(tsk)) - goto free_stack; - - stack_vm_area = task_stack_vm_area(tsk); - - err = arch_dup_task_struct(tsk, orig); + err = alloc_thread_stack_node(tsk, node); + if (err) + goto free_tsk; - /* - * arch_dup_task_struct() clobbers the stack-related fields. Make - * sure they're properly initialized before using any stack-related - * functions again. - */ - tsk->stack = stack; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; -#endif #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif - - if (err) - goto free_stack; + account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) @@ -953,8 +1025,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->wake_q.next = NULL; tsk->worker_private = NULL; - account_kernel_stack(tsk, 1); - kcov_task_init(tsk); kmap_local_fork(tsk); @@ -967,12 +1037,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_IOMMU_SVA + tsk->pasid_activated = 0; +#endif + #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_CPU_SUP_INTEL + tsk->reported_split_lock = 0; +#endif + return tsk; free_stack: + exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); @@ -1019,13 +1099,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static void mm_init_pasid(struct mm_struct *mm) -{ -#ifdef CONFIG_IOMMU_SUPPORT - mm->pasid = INIT_PASID; -#endif -} - static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1054,7 +1127,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); - mm_init_pasid(mm); + mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); @@ -1909,7 +1982,7 @@ static __latent_entropy struct task_struct *copy_process( struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; - u64 clone_flags = args->flags; + const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* @@ -1998,6 +2071,9 @@ static __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; + p->flags &= ~PF_KTHREAD; + if (args->kthread) + p->flags |= PF_KTHREAD; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2087,7 +2163,7 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); - if (p->flags & PF_KTHREAD) { + if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } @@ -2170,7 +2246,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); + retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; @@ -2255,6 +2331,9 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif +#ifdef CONFIG_RETHOOK + p->rethooks.first = NULL; +#endif /* * Ensure that the cgroup subsystem policies allow the new process to be @@ -2451,6 +2530,7 @@ bad_fork_cleanup_count: exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); + exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: @@ -2470,11 +2550,21 @@ static inline void init_idle_pids(struct task_struct *idle) } } +static int idle_dummy(void *dummy) +{ + /* This function is never called */ + return 0; +} + struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { - .flags = CLONE_VM, + .flags = CLONE_VM, + .fn = &idle_dummy, + .fn_arg = NULL, + .kthread = 1, + .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); @@ -2505,8 +2595,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .io_thread = 1, }; @@ -2610,8 +2700,25 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, + .kthread = 1, + }; + + return kernel_clone(&args); +} + +/* + * Create a user mode thread. + */ +pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .fn = fn, + .fn_arg = arg, }; return kernel_clone(&args); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 51dd822a8060..b22ef1efe751 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -302,7 +302,7 @@ again: * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the page lock (also * cases which we are happy to fail). And we hold a reference, - * so refcount care in invalidate_complete_page's remove_mapping + * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c264cbeab71c..b5379c0e6d6d 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -3,6 +3,7 @@ #define _FUTEX_H #include <linux/futex.h> +#include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #ifdef CONFIG_PREEMPT_RT diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 183b28c32c83..ce2889f12375 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1005,7 +1005,7 @@ retry_private: rt_mutex_init_waiter(&rt_waiter); /* - * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not * hold it while doing rt_mutex_start_proxy(), because then it will * include hb->lock in the blocking chain, even through we'll not in * fact hold it while blocking. This will lead it to report -EDEADLK diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 1966a749e0d9..0c78e64f747d 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -74,7 +74,7 @@ fi # of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 +done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1 # Remove comments except SDPX lines find $cpio_dir -type f -print0 | diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f7655..cff3ae8c818f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ unsigned int __read_mostly sysctl_hung_task_panic = - CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f7ff8919dc9b..d9a5c1d65a79 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -258,7 +258,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, nodemask_t nodemsk = NODE_MASK_NONE; struct node_vectors *node_vectors; - if (!cpumask_weight(cpu_mask)) + if (cpumask_empty(cpu_mask)) return 0; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); @@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, - node_to_cpumask[n]); + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); if (++curvec == last_affv) curvec = firstvec; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c09324663088..886789dcee43 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -38,7 +38,7 @@ struct irqaction chained_action = { * @irq: irq number * @chip: pointer to irq chip description structure */ -int irq_set_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); @@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!desc) return -EINVAL; - if (!chip) - chip = &no_irq_chip; - - desc->irq_data.chip = chip; + desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in @@ -1009,8 +1006,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); - if (is_chained) + if (is_chained) { desc->action = NULL; + WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc))); + } desc->depth = 1; } desc->handle_irq = handle; @@ -1036,6 +1035,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; + WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc))); irq_activate_and_startup(desc, IRQ_RESEND); } } @@ -1073,7 +1073,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); void -irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); @@ -1558,6 +1558,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } +static struct device *irq_get_parent_device(struct irq_data *data) +{ + if (data->domain) + return data->domain->dev; + + return NULL; +} + /** * irq_chip_pm_get - Enable power for an IRQ chip * @data: Pointer to interrupt specific data @@ -1567,17 +1575,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) */ int irq_chip_pm_get(struct irq_data *data) { - int retval; + struct device *dev = irq_get_parent_device(data); + int retval = 0; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { - retval = pm_runtime_get_sync(data->chip->parent_device); - if (retval < 0) { - pm_runtime_put_noidle(data->chip->parent_device); - return retval; - } - } + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_resume_and_get(dev); - return 0; + return retval; } /** @@ -1590,10 +1594,11 @@ int irq_chip_pm_get(struct irq_data *data) */ int irq_chip_pm_put(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval = 0; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) - retval = pm_runtime_put(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_put(dev); return (retval < 0) ? retval : 0; } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 39a41c56ad4f..1ed2b1739363 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu) { const struct cpumask *hk_mask; - if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) + if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) return false; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask)) return false; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index e4cff358b437..bc8e40cf2b65 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -58,6 +58,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), + BIT_MASK_DESCR(IRQCHIP_IMMUTABLE), }; static void @@ -69,8 +70,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) seq_printf(m, "chip: None\n"); return; } - seq_printf(m, "%*schip: %s\n", ind, "", chip->name); - seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); + seq_printf(m, "%*schip: ", ind, ""); + if (chip->irq_print_chip) + chip->irq_print_chip(data, m); + else + seq_printf(m, "%s", chip->name); + seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags); irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, ARRAY_SIZE(irqchip_flags)); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99cbdf55a8bd..f09c60393e55 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -29,12 +29,14 @@ extern struct irqaction chained_action; * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded + * IRQTF_READY - signals that irq thread is ready */ enum { IRQTF_RUNTHREAD, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, + IRQTF_READY, }; /* diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 0cd02efa3a74..dd76323ea3fd 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, goto err_free_bitmap; work_ctx->irq_count = num_irqs; - init_irq_work(&work_ctx->work, irq_sim_handle_irq); + work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq); return work_ctx->domain; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2267e6527db3..d323b180b0f3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -407,6 +407,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); + init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); @@ -575,6 +576,7 @@ int __init early_irq_init(void) raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); mutex_init(&desc[i].request_mutex); + init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -640,7 +642,7 @@ int handle_irq_desc(struct irq_desc *desc) return -EINVAL; data = irq_desc_get_irq_data(desc); - if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) + if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); @@ -662,6 +664,29 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq from any + * context. + * @irq: The irq number to handle + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process context). It + * will report an error if not invoked from IRQ context and the irq has been + * marked to enforce IRQ-context only. + */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging @@ -676,7 +701,6 @@ EXPORT_SYMBOL_GPL(generic_handle_irq); */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { - WARN_ON_ONCE(!in_irq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index bf38c546aa25..d5ce96510549 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, + const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); @@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return -ENOENT; irq_data->hwirq = hwirq; - irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; @@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { @@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f23ffd30385b..8c396319d5ac 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -222,11 +222,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, { struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); + const struct cpumask *prog_mask; int ret; + static DEFINE_RAW_SPINLOCK(tmp_mask_lock); + static struct cpumask tmp_mask; + if (!chip || !chip->irq_set_affinity) return -EINVAL; + raw_spin_lock(&tmp_mask_lock); /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with @@ -247,25 +252,35 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, * online. */ if (irqd_affinity_is_managed(data) && - housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) { - const struct cpumask *hk_mask, *prog_mask; - - static DEFINE_RAW_SPINLOCK(tmp_mask_lock); - static struct cpumask tmp_mask; + housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { + const struct cpumask *hk_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); - raw_spin_lock(&tmp_mask_lock); cpumask_and(&tmp_mask, mask, hk_mask); if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = &tmp_mask; - ret = chip->irq_set_affinity(data, prog_mask, force); - raw_spin_unlock(&tmp_mask_lock); } else { - ret = chip->irq_set_affinity(data, mask, force); + prog_mask = mask; } + + /* + * Make sure we only provide online CPUs to the irqchip, + * unless we are being asked to force the affinity (in which + * case we do as we are told). + */ + cpumask_and(&tmp_mask, prog_mask, cpu_online_mask); + if (!force && !cpumask_empty(&tmp_mask)) + ret = chip->irq_set_affinity(data, &tmp_mask, force); + else if (force) + ret = chip->irq_set_affinity(data, mask, force); + else + ret = -EINVAL; + + raw_spin_unlock(&tmp_mask_lock); + switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: @@ -1249,6 +1264,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) } /* + * Internal function to notify that a interrupt thread is ready. + */ +static void irq_thread_set_ready(struct irq_desc *desc, + struct irqaction *action) +{ + set_bit(IRQTF_READY, &action->thread_flags); + wake_up(&desc->wait_for_threads); +} + +/* + * Internal function to wake up a interrupt thread and wait until it is + * ready. + */ +static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, + struct irqaction *action) +{ + if (!action || !action->thread) + return; + + wake_up_process(action->thread); + wait_event(desc->wait_for_threads, + test_bit(IRQTF_READY, &action->thread_flags)); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -1259,6 +1299,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irq_thread_set_ready(desc, action); + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, @@ -1683,8 +1725,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - init_waitqueue_head(&desc->wait_for_threads); - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, @@ -1780,14 +1820,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_setup_timings(desc, new); - /* - * Strictly no need to wake it up, but hung_task complains - * when no hard interrupt wakes the thread up. - */ - if (new->thread) - wake_up_process(new->thread); - if (new->secondary) - wake_up_process(new->secondary->thread); + wake_up_and_wait_for_irq_thread_ready(desc, new); + wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index bbfb26489aa1..1698e77645ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -286,7 +286,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu) { - unsigned int bit, cpu, end = m->alloc_end; + unsigned int bit, cpu, end; struct cpumap *cm; if (cpumask_empty(msk)) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2bdfce5edafd..a9ee535293eb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -818,6 +818,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag irqd_clr_can_reserve(irqd); if (vflags & VIRQ_NOMASK_QUIRK) irqd_set_msi_nomask_quirk(irqd); + + /* + * If the interrupt is managed but no CPU is available to + * service it, shut it down until better times. Note that + * we only do this on the !RESERVE path as x86 (the only + * architecture using this flag) deals with this in a + * different way by using a catch-all vector. + */ + if ((vflags & VIRQ_ACTIVATE) && + irqd_affinity_is_managed(irqd) && + !cpumask_intersects(irq_data_get_affinity_mask(irqd), + cpu_online_mask)) { + irqd_set_managed_shutdown(irqd); + return 0; + } } if (!(vflags & VIRQ_ACTIVATE)) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f7df715ec28e..7afa40fe5cc4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); preempt_disable(); if (cpu != smp_processor_id()) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 951c93216fc4..fbdf8d3279ac 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -29,6 +29,7 @@ #include <linux/compiler.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/bsearch.h> /* * These will be re-linked against their real values @@ -212,6 +213,10 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned long i; unsigned int off; + /* Skip the search for empty string. */ + if (!*name) + return 0; + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); @@ -224,7 +229,6 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } -#ifdef CONFIG_LIVEPATCH /* * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. @@ -247,7 +251,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, } return 0; } -#endif /* CONFIG_LIVEPATCH */ static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, diff --git a/kernel/kcov.c b/kernel/kcov.c index 36ca640c4f8e..e19c84b02452 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first 64-bit word is the number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = ip; + /* Previously we write pc before updating pos. However, some + * early interrupt code could bypass check_kcov_mode() check + * and invoke __sanitizer_cov_trace_pc(). If such interrupt is + * raised between writing pc and updating pos, the pc could be + * overitten by the recursive __sanitizer_cov_trace_pc(). + * Update pos before writing pc to avoid such interleaving. + */ WRITE_ONCE(area[0], pos); + barrier(); + area[pos] = ip; } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); @@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) start_index = 1 + count * KCOV_WORDS_PER_CMP; end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); if (likely(end_pos <= max_pos)) { + /* See comment in __sanitizer_cov_trace_pc(). */ + WRITE_ONCE(area[0], count + 1); + barrier(); area[start_index] = type; area[start_index + 1] = arg1; area[start_index + 2] = arg2; area[start_index + 3] = ip; - WRITE_ONCE(area[0], count + 1); } } @@ -459,37 +469,31 @@ void kcov_task_exit(struct task_struct *t) static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) { int res = 0; - void *area; struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; unsigned long flags; - area = vmalloc_user(vma->vm_end - vma->vm_start); - if (!area) - return -ENOMEM; - spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); - if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || + if (kcov->area == NULL || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; } - if (!kcov->area) { - kcov->area = area; - vma->vm_flags |= VM_DONTEXPAND; - spin_unlock_irqrestore(&kcov->lock, flags); - for (off = 0; off < size; off += PAGE_SIZE) { - page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); + spin_unlock_irqrestore(&kcov->lock, flags); + vma->vm_flags |= VM_DONTEXPAND; + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + res = vm_insert_page(vma, vma->vm_start + off, page); + if (res) { + pr_warn_once("kcov: vm_insert_page() failed\n"); + return res; } - return 0; } + return 0; exit: spin_unlock_irqrestore(&kcov->lock, flags); - vfree(area); return res; } @@ -564,31 +568,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; - unsigned long size, unused; + unsigned long flags, unused; int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; - unsigned long flags; switch (cmd) { - case KCOV_INIT_TRACE: - /* - * Enable kcov in trace mode and setup buffer size. - * Must happen before anything else. - */ - if (kcov->mode != KCOV_MODE_DISABLED) - return -EBUSY; - /* - * Size must be at least 2 to hold current position and one PC. - * Later we allocate size * sizeof(unsigned long) memory, - * that must not overflow. - */ - size = arg; - if (size < 2 || size > INT_MAX / sizeof(unsigned long)) - return -EINVAL; - kcov->size = size; - kcov->mode = KCOV_MODE_INIT; - return 0; case KCOV_ENABLE: /* * Enable coverage for the current task. @@ -692,9 +677,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; - unsigned long flags; + unsigned long size, flags; + void *area; - if (cmd == KCOV_REMOTE_ENABLE) { + kcov = filep->private_data; + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + * + * First check the size argument - it must be at least 2 + * to hold the current position and one PC. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + area = vmalloc_user(size * sizeof(unsigned long)); + if (area == NULL) + return -ENOMEM; + spin_lock_irqsave(&kcov->lock, flags); + if (kcov->mode != KCOV_MODE_DISABLED) { + spin_unlock_irqrestore(&kcov->lock, flags); + vfree(area); + return -EBUSY; + } + kcov->area = area; + kcov->size = size; + kcov->mode = KCOV_MODE_INIT; + spin_unlock_irqrestore(&kcov->lock, flags); + return 0; + case KCOV_REMOTE_ENABLE: if (get_user(remote_num_handles, (unsigned __user *)(arg + offsetof(struct kcov_remote_arg, num_handles)))) return -EFAULT; @@ -710,16 +723,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) return -EINVAL; } arg = (unsigned long)remote_arg; + fallthrough; + default: + /* + * All other commands can be normally executed under a spin lock, so we + * obtain and release it here in order to simplify kcov_ioctl_locked(). + */ + spin_lock_irqsave(&kcov->lock, flags); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock_irqrestore(&kcov->lock, flags); + kfree(remote_arg); + return res; } - - kcov = filep->private_data; - spin_lock_irqsave(&kcov->lock, flags); - res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock_irqrestore(&kcov->lock, flags); - - kfree(remote_arg); - - return res; } static const struct file_operations kcov_fops = { diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index a36fca063a73..dcec1b743c69 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc) else nthreads *= 2; - if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + if (!preempt_model_preemptible() || + !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { /* * Without any preemption, keep 2 CPUs free for other tasks, one * of which is the main test case function checking for * completion or failure. */ - const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const long min_unused_cpus = preempt_model_none() ? 2 : 0; const long min_required_cpus = 2 + min_unused_cpus; if (num_online_cpus() < min_required_cpus) { @@ -1565,14 +1566,6 @@ static void test_exit(struct kunit *test) torture_cleanup_end(); } -static struct kunit_suite kcsan_test_suite = { - .name = "kcsan", - .test_cases = kcsan_test_cases, - .init = test_init, - .exit = test_exit, -}; -static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; - __no_kcsan static void register_tracepoints(struct tracepoint *tp, void *ignore) { @@ -1588,11 +1581,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore) tracepoint_probe_unregister(tp, probe_console, NULL); } -/* - * We only want to do tracepoints setup and teardown once, therefore we have to - * customize the init and exit functions and cannot rely on kunit_test_suite(). - */ -static int __init kcsan_test_init(void) +static int kcsan_suite_init(struct kunit_suite *suite) { /* * Because we want to be able to build the test as a module, we need to @@ -1600,18 +1589,25 @@ static int __init kcsan_test_init(void) * won't work here. */ for_each_kernel_tracepoint(register_tracepoints, NULL); - return __kunit_test_suites_init(kcsan_test_suites); + return 0; } -static void kcsan_test_exit(void) +static void kcsan_suite_exit(struct kunit_suite *suite) { - __kunit_test_suites_exit(kcsan_test_suites); for_each_kernel_tracepoint(unregister_tracepoints, NULL); tracepoint_synchronize_unregister(); } -late_initcall_sync(kcsan_test_init); -module_exit(kcsan_test_exit); +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kcsan_suite_init, + .suite_exit = kcsan_suite_exit, +}; + +kunit_test_suites(&kcsan_test_suite); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..4d34c78334ce 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -768,7 +768,6 @@ static struct page *kimage_alloc_page(struct kimage *image, kimage_free_pages(old_page); continue; } - addr = old_addr; page = old_page; break; } @@ -788,7 +787,6 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; - result = 0; if (image->file_mode) kbuf = segment->kbuf; else @@ -936,6 +934,28 @@ int kimage_load_segment(struct kimage *image, struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; +#ifdef CONFIG_SYSCTL +static struct ctl_table kexec_core_sysctls[] = { + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init kexec_core_sysctl_init(void) +{ + register_sysctl_init("kernel", kexec_core_sysctls); + return 0; +} +late_initcall(kexec_core_sysctl_init); +#endif /* * No panic_cpu check version of crash_kexec(). This function is called @@ -1078,7 +1098,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + elf_core_copy_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 8347fc158d2b..145321a5e798 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,40 +109,6 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, #endif /* - * arch_kexec_apply_relocations_add - apply relocations of type RELA - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELAs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * arch_kexec_apply_relocations - apply relocations of type REL - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - -/* * Free up memory used by kernel, initrd, and command line. This is temporary * memory allocation which is not needed any more after these buffers have * been loaded into separate segments and have been copied elsewhere. @@ -1260,7 +1226,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, +int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { Elf64_Ehdr *ehdr; @@ -1324,7 +1290,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr++; /* Prepare PT_LOAD type program header for kernel text region */ - if (kernel_map) { + if (need_kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_vaddr = (unsigned long) _text; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 94cab8c9ce56..f214f8c088ed 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1237,27 +1237,6 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -static void free_rp_inst_rcu(struct rcu_head *head) -{ - struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); - - if (refcount_dec_and_test(&ri->rph->ref)) - kfree(ri->rph); - kfree(ri); -} -NOKPROBE_SYMBOL(free_rp_inst_rcu); - -static void recycle_rp_inst(struct kretprobe_instance *ri) -{ - struct kretprobe *rp = get_kretprobe(ri); - - if (likely(rp)) - freelist_add(&ri->freelist, &rp->freelist); - else - call_rcu(&ri->rcu, free_rp_inst_rcu); -} -NOKPROBE_SYMBOL(recycle_rp_inst); - static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; @@ -1278,56 +1257,6 @@ void kprobe_busy_end(void) preempt_enable(); } -/* - * This function is called from delayed_put_task_struct() when a task is - * dead and cleaned up to recycle any kretprobe instances associated with - * this task. These left over instances represent probed functions that - * have been called but will never return. - */ -void kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - struct llist_node *node; - - /* Early boot, not yet initialized. */ - if (unlikely(!kprobes_initialized)) - return; - - kprobe_busy_begin(); - - node = __llist_del_all(&tk->kretprobe_instances); - while (node) { - ri = container_of(node, struct kretprobe_instance, llist); - node = node->next; - - recycle_rp_inst(ri); - } - - kprobe_busy_end(); -} -NOKPROBE_SYMBOL(kprobe_flush_task); - -static inline void free_rp_inst(struct kretprobe *rp) -{ - struct kretprobe_instance *ri; - struct freelist_node *node; - int count = 0; - - node = rp->freelist.head; - while (node) { - ri = container_of(node, struct kretprobe_instance, freelist); - node = node->next; - - kfree(ri); - count++; - } - - if (refcount_sub_and_test(count, &rp->rph->ref)) { - kfree(rp->rph); - rp->rph = NULL; - } -} - /* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { @@ -1489,24 +1418,68 @@ bool within_kprobe_blacklist(unsigned long addr) } /* + * arch_adjust_kprobe_addr - adjust the address + * @addr: symbol base address + * @offset: offset within the symbol + * @on_func_entry: was this @addr+@offset on the function entry + * + * Typically returns @addr + @offset, except for special cases where the + * function might be prefixed by a CFI landing pad, in that case any offset + * inside the landing pad is mapped to the first 'real' instruction of the + * symbol. + * + * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C + * instruction at +0. + */ +kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr, + unsigned long offset, + bool *on_func_entry) +{ + *on_func_entry = !offset; + return (kprobe_opcode_t *)(addr + offset); +} + +/* * If 'symbol_name' is specified, look it up and add the 'offset' * to it. This way, we can specify a relative address to a symbol. * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ -static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr, - const char *symbol_name, unsigned int offset) +static kprobe_opcode_t * +_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, + unsigned long offset, bool *on_func_entry) { if ((symbol_name && addr) || (!symbol_name && !addr)) goto invalid; if (symbol_name) { + /* + * Input: @sym + @offset + * Output: @addr + @offset + * + * NOTE: kprobe_lookup_name() does *NOT* fold the offset + * argument into it's output! + */ addr = kprobe_lookup_name(symbol_name, offset); if (!addr) return ERR_PTR(-ENOENT); } - addr = (kprobe_opcode_t *)(((char *)addr) + offset); + /* + * So here we have @addr + @offset, displace it into a new + * @addr' + @offset' where @addr' is the symbol start address. + */ + addr = (void *)addr + offset; + if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset)) + return ERR_PTR(-ENOENT); + addr = (void *)addr - offset; + + /* + * Then ask the architecture to re-combine them, taking care of + * magical function entry details while telling us if this was indeed + * at the start of the function. + */ + addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry); if (addr) return addr; @@ -1516,7 +1489,8 @@ invalid: static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { - return _kprobe_addr(p->addr, p->symbol_name, p->offset); + bool on_func_entry; + return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); } /* @@ -1562,14 +1536,10 @@ static inline int warn_kprobe_rereg(struct kprobe *p) static int check_ftrace_location(struct kprobe *p) { - unsigned long ftrace_addr; + unsigned long addr = (unsigned long)p->addr; - ftrace_addr = ftrace_location((unsigned long)p->addr); - if (ftrace_addr) { + if (ftrace_location(addr) == addr) { #ifdef CONFIG_KPROBES_ON_FTRACE - /* Given address is not on the instruction boundary */ - if ((unsigned long)p->addr != ftrace_addr) - return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; #else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; @@ -1884,6 +1854,78 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES +#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) +static void free_rp_inst_rcu(struct rcu_head *head) +{ + struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); + + if (refcount_dec_and_test(&ri->rph->ref)) + kfree(ri->rph); + kfree(ri); +} +NOKPROBE_SYMBOL(free_rp_inst_rcu); + +static void recycle_rp_inst(struct kretprobe_instance *ri) +{ + struct kretprobe *rp = get_kretprobe(ri); + + if (likely(rp)) + freelist_add(&ri->freelist, &rp->freelist); + else + call_rcu(&ri->rcu, free_rp_inst_rcu); +} +NOKPROBE_SYMBOL(recycle_rp_inst); + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + struct llist_node *node; + + /* Early boot, not yet initialized. */ + if (unlikely(!kprobes_initialized)) + return; + + kprobe_busy_begin(); + + node = __llist_del_all(&tk->kretprobe_instances); + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + node = node->next; + + recycle_rp_inst(ri); + } + + kprobe_busy_end(); +} +NOKPROBE_SYMBOL(kprobe_flush_task); + +static inline void free_rp_inst(struct kretprobe *rp) +{ + struct kretprobe_instance *ri; + struct freelist_node *node; + int count = 0; + + node = rp->freelist.head; + while (node) { + ri = container_of(node, struct kretprobe_instance, freelist); + node = node->next; + + kfree(ri); + count++; + } + + if (refcount_sub_and_test(count, &rp->rph->ref)) { + kfree(rp->rph); + rp->rph = NULL; + } +} + /* This assumes the 'tsk' is the current task or the is not running. */ static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, struct llist_node **cur) @@ -2046,11 +2088,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); +#else /* CONFIG_KRETPROBE_ON_RETHOOK */ +/* + * This kprobe pre_handler is registered with every kretprobe. When probe + * hits it will set up the return probe. + */ +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + struct kretprobe_instance *ri; + struct rethook_node *rhn; + + rhn = rethook_try_get(rp->rh); + if (!rhn) { + rp->nmissed++; + return 0; + } + + ri = container_of(rhn, struct kretprobe_instance, node); + + if (rp->entry_handler && rp->entry_handler(ri, regs)) + rethook_recycle(rhn); + else + rethook_hook(rhn, regs, kprobe_ftrace(p)); + + return 0; +} +NOKPROBE_SYMBOL(pre_handler_kretprobe); -bool __weak arch_kprobe_on_func_entry(unsigned long offset) +static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) { - return !offset; + struct kretprobe *rp = (struct kretprobe *)data; + struct kretprobe_instance *ri; + struct kprobe_ctlblk *kcb; + + /* The data must NOT be null. This means rethook data structure is broken. */ + if (WARN_ON_ONCE(!data) || !rp->handler) + return; + + __this_cpu_write(current_kprobe, &rp->kp); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + ri = container_of(rh, struct kretprobe_instance, node); + rp->handler(ri, regs); + + __this_cpu_write(current_kprobe, NULL); } +NOKPROBE_SYMBOL(kretprobe_rethook_handler); + +#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ /** * kprobe_on_func_entry() -- check whether given address is function entry @@ -2067,15 +2155,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset) */ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { - kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); + bool on_func_entry; + kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry); if (IS_ERR(kp_addr)) return PTR_ERR(kp_addr); - if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset)) - return -ENOENT; - - if (!arch_kprobe_on_func_entry(offset)) + if (!on_func_entry) return -EINVAL; return 0; @@ -2121,6 +2207,29 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); + if (!rp->rh) + return -ENOMEM; + + for (i = 0; i < rp->maxactive; i++) { + inst = kzalloc(sizeof(struct kretprobe_instance) + + rp->data_size, GFP_KERNEL); + if (inst == NULL) { + rethook_free(rp->rh); + rp->rh = NULL; + return -ENOMEM; + } + rethook_add_node(rp->rh, &inst->node); + } + rp->nmissed = 0; + /* Establish function entry probe point */ + ret = register_kprobe(&rp->kp); + if (ret != 0) { + rethook_free(rp->rh); + rp->rh = NULL; + } +#else /* !CONFIG_KRETPROBE_ON_RETHOOK */ rp->freelist.head = NULL; rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); if (!rp->rph) @@ -2145,6 +2254,7 @@ int register_kretprobe(struct kretprobe *rp) ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); +#endif return ret; } EXPORT_SYMBOL_GPL(register_kretprobe); @@ -2183,7 +2293,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num) for (i = 0; i < num; i++) { if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; +#ifdef CONFIG_KRETPROBE_ON_RETHOOK + rethook_free(rps[i]->rh); +#else rps[i]->rph->rp = NULL; +#endif } mutex_unlock(&kprobe_mutex); @@ -2191,7 +2305,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num) for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); +#ifndef CONFIG_KRETPROBE_ON_RETHOOK free_rp_inst(rps[i]); +#endif } } } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 35859da8bd4f..b1292a57c2a5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,8 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KERNEL_ATTR_RW(_name) \ -static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) +static struct kobj_attribute _name##_attr = __ATTR_RW(_name) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, diff --git a/kernel/kthread.c b/kernel/kthread.c index 38c6dd822da8..3c677918d8f2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -55,7 +55,6 @@ struct kthread { int result; int (*threadfn)(void *); void *data; - mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP @@ -341,7 +340,7 @@ static int kthread(void *_create) self = to_kthread(current); - /* If user was SIGKILLed, I release the structure. */ + /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create); @@ -356,7 +355,7 @@ static int kthread(void *_create) * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); @@ -399,7 +398,7 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { - /* If user was SIGKILLed, I release the structure. */ + /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); if (!done) { @@ -441,9 +440,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), */ if (unlikely(wait_for_completion_killable(&done))) { /* - * If I was SIGKILLed before kthreadd (or new kernel thread) - * calls complete(), leave the cleanup of this structure to - * that thread. + * If I was killed by a fatal signal before kthreadd (or new + * kernel thread) calls complete(), leave the cleanup of this + * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); @@ -722,7 +721,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; @@ -877,7 +876,7 @@ fail_task: * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) - * when the worker was SIGKILLed. + * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) @@ -926,7 +925,7 @@ EXPORT_SYMBOL(kthread_create_worker); * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) - * when the worker was SIGKILLed. + * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, @@ -1441,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm) mmdrop(active_mm); else smp_mb(); - - to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1457,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - force_uaccess_end(to_kthread(tsk)->oldfs); - task_lock(tsk); /* * When a kthread stops operating on an address space, the loop @@ -1527,5 +1522,4 @@ struct cgroup_subsys_state *kthread_blkcg(void) } return NULL; } -EXPORT_SYMBOL(kthread_blkcg); #endif diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 166d7bf49666..76166df011a4 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -55,6 +55,7 @@ #include <linux/sched/stat.h> #include <linux/list.h> #include <linux/stacktrace.h> +#include <linux/sysctl.h> static DEFINE_RAW_SPINLOCK(latency_lock); @@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; +#ifdef CONFIG_SYSCTL +static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} + +static struct ctl_table latencytop_sysctl[] = { + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, + {} +}; +#endif + void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; @@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = { static int __init init_lstats_procfs(void) { proc_create("latency_stats", 0644, NULL, &lstats_proc_ops); +#ifdef CONFIG_SYSCTL + register_sysctl_init("kernel", latencytop_sysctl); +#endif return 0; } - -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - int err; - - err = proc_dointvec(table, write, buffer, lenp, ppos); - if (latencytop_enabled) - force_schedstat_enabled(); - - return err; -} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 585494ec464f..bc475e62279d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, +static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, unsigned int symndx, Elf_Shdr *relasec, const char *sec_objname) { @@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); + sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index fe316c021d73..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,25 +118,12 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); } -/* - * Convert a function address into the appropriate ftrace location. - * - * Usually this is just the address of the function, but on some architectures - * it's more complicated so allow them to provide a custom behaviour. - */ -#ifndef klp_get_ftrace_location -static unsigned long klp_get_ftrace_location(unsigned long faddr) -{ - return faddr; -} -#endif - static void klp_unpatch_func(struct klp_func *func) { struct klp_ops *ops; @@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func) if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = - klp_get_ftrace_location((unsigned long)func->old_func); + ftrace_loc = ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func) if (!ops) { unsigned long ftrace_loc; - ftrace_loc = - klp_get_ftrace_location((unsigned long)func->old_func); + ftrace_loc = ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 5683ac0d2566..5d03a2ad1066 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -9,7 +9,6 @@ #include <linux/cpu.h> #include <linux/stacktrace.h> -#include <linux/tracehook.h> #include "core.h" #include "patch.h" #include "transition.h" @@ -641,6 +640,13 @@ void klp_force_transition(void) for_each_possible_cpu(cpu) klp_update_patch_state(idle_task(cpu)); - klp_for_each_patch(patch) - patch->forced = true; + /* Set forced flag for patches being removed. */ + if (klp_target_state == KLP_UNPATCHED) + klp_transition_patch->forced = true; + else if (klp_transition_patch->replace) { + klp_for_each_patch(patch) { + if (patch != klp_transition_patch) + patch->forced = true; + } + } } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f8a0212189ca..f06b91ca6482 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -60,23 +60,53 @@ #include "lockdep_internals.h" -#define CREATE_TRACE_POINTS #include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; +static int prove_locking = 1; module_param(prove_locking, int, 0644); #else #define prove_locking 0 #endif #ifdef CONFIG_LOCK_STAT -int lock_stat = 1; +static int lock_stat = 1; module_param(lock_stat, int, 0644); #else #define lock_stat 0 #endif +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_lockdep_table[] = { +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_LOCK_STAT */ + { } +}; + +static __init int kernel_lockdep_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_lockdep_table); + return 0; +} +late_initcall(kernel_lockdep_sysctls_init); +#endif /* CONFIG_SYSCTL */ + DEFINE_PER_CPU(unsigned int, lockdep_recursion); EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); @@ -183,11 +213,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; -#ifndef CONFIG_DEBUG_LOCKDEP -static -#endif +unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; -static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); +DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -338,7 +366,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock) * elements. These elements are linked together by the lock_entry member in * struct lock_class. */ -LIST_HEAD(all_lock_classes); +static LIST_HEAD(all_lock_classes); static LIST_HEAD(free_lock_classes); /** @@ -1252,6 +1280,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + int idx; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1317,6 +1346,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * of classes. */ list_move_tail(&class->lock_entry, &all_lock_classes); + idx = class - lock_classes; + if (idx > max_lock_class_idx) + max_lock_class_idx = idx; if (verbose(class)) { graph_unlock(); @@ -1378,7 +1410,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, u16 distance, u8 dep, + u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -3131,19 +3163,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, - calc_dep(prev, next), - *trace); + &hlock_class(prev)->locks_after, distance, + calc_dep(prev, next), *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, - calc_depb(prev, next), - *trace); + &hlock_class(next)->locks_before, distance, + calc_depb(prev, next), *trace); if (!ret) return 0; @@ -4234,14 +4262,13 @@ static void __trace_hardirqs_on_caller(void) /** * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts - * @ip: Caller address * * Invoked before a possible transition to RCU idle from exit to user or * guest mode. This ensures that all RCU operations are done before RCU * stops watching. After the RCU transition lockdep_hardirqs_on() has to be * invoked to set the final state. */ -void lockdep_hardirqs_on_prepare(unsigned long ip) +void lockdep_hardirqs_on_prepare(void) { if (unlikely(!debug_locks)) return; @@ -4838,8 +4865,7 @@ EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static void print_lock_nested_lock_not_held(struct task_struct *curr, - struct held_lock *hlock, - unsigned long ip) + struct held_lock *hlock) { if (!debug_locks_off()) return; @@ -5015,7 +5041,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { - print_lock_nested_lock_not_held(curr, hlock, ip); + print_lock_nested_lock_not_held(curr, hlock); return 0; } @@ -5406,7 +5432,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) * be guessable and still allows some pin nesting in * our u32 pin_count. */ - cookie.val = 1 + (prandom_u32() >> 16); + cookie.val = 1 + (sched_clock() & 0xffff); hlock->pin_count += cookie.val; return cookie; } @@ -6000,6 +6026,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->name, NULL); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); + if (class - lock_classes == max_lock_class_idx) + max_lock_class_idx--; } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -6011,13 +6039,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) static void reinit_class(struct lock_class *class) { - void *const p = class; - const unsigned int offset = offsetof(struct lock_class, key); - WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); - memset(p + offset, 0, sizeof(*class) - offset); + memset_startat(class, 0, key); WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); @@ -6290,7 +6315,13 @@ void lockdep_reset_lock(struct lockdep_map *lock) lockdep_reset_lock_reg(lock); } -/* Unregister a dynamically allocated key. */ +/* + * Unregister a dynamically allocated key. + * + * Unlike lockdep_register_key(), a search is always done to find a matching + * key irrespective of debug_locks to avoid potential invalid access to freed + * memory in lock_class entry. + */ void lockdep_unregister_key(struct lock_class_key *key) { struct hlist_head *hash_head = keyhashentry(key); @@ -6305,10 +6336,8 @@ void lockdep_unregister_key(struct lock_class_key *key) return; raw_local_irq_save(flags); - if (!graph_lock()) - goto out_irq; + lockdep_lock(); - pf = get_pending_free(); hlist_for_each_entry_rcu(k, hash_head, hash_entry) { if (k == key) { hlist_del_rcu(&k->hash_entry); @@ -6316,11 +6345,13 @@ void lockdep_unregister_key(struct lock_class_key *key) break; } } - WARN_ON_ONCE(!found); - __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); - graph_unlock(); -out_irq: + WARN_ON_ONCE(!found && debug_locks); + if (found) { + pf = get_pending_free(); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + } + lockdep_unlock(); raw_local_irq_restore(flags); /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index ecb8662e7a4e..bbe9000260d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; #define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) @@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks; extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; +extern unsigned long max_lock_class_idx; + +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +extern unsigned long lock_classes_in_use[]; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); @@ -205,7 +208,6 @@ struct lockdep_stats { }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); -extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr) \ this_cpu_inc(lockdep_stats.ptr); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b8d9a050c337..15fdc7fa5c68 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -24,14 +24,33 @@ #include "lockdep_internals.h" +/* + * Since iteration of lock_classes is done without holding the lockdep lock, + * it is not safe to iterate all_lock_classes list directly as the iteration + * may branch off to free_lock_classes or the zapped list. Iteration is done + * directly on the lock_classes array by checking the lock_classes_in_use + * bitmap and max_lock_class_idx. + */ +#define iterate_lock_classes(idx, class) \ + for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \ + idx++, class++) + static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &all_lock_classes, pos); + struct lock_class *class = v; + + ++class; + *pos = class - lock_classes; + return (*pos > max_lock_class_idx) ? NULL : class; } static void *l_start(struct seq_file *m, loff_t *pos) { - return seq_list_start_head(&all_lock_classes, *pos); + unsigned long idx = *pos; + + if (idx > max_lock_class_idx) + return NULL; + return lock_classes + idx; } static void l_stop(struct seq_file *m, void *v) @@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_class *class = v; struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; + int idx = class - lock_classes; - if (v == &all_lock_classes) { + if (v == lock_classes) seq_printf(m, "all lock classes:\n"); + + if (!test_bit(idx, lock_classes_in_use)) return 0; - } seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP @@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING struct lock_class *class; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; if (class->usage_mask == 0) nr_unused++; @@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps += lockdep_count_forward_deps(class); } + #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif @@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); #endif + seq_printf(m, " max lock class index: %11lu\n", + max_lock_class_idx); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); @@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!res) { struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; iter->class = class; iter->stats = lock_stats(class); iter++; } + data->iter_end = iter; sort(data->stats, data->iter_end - data->stats, @@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct lock_class *class; + unsigned long idx; char c; if (count) { @@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, if (c != '0') return count; - list_for_each_entry(class, &all_lock_classes, lock_entry) + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; clear_lock_stats(class); + } } return count; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5e3585950ec8..d973fe6041bf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,6 +30,9 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/lock.h> + #ifndef CONFIG_PREEMPT_RT #include "mutex.h" @@ -599,12 +602,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); + trace_contention_end(lock, 0); preempt_enable(); return 0; } @@ -641,6 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } set_current_state(state); + trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { bool first; @@ -680,10 +686,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock_or_handoff(lock, first) || - (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) + if (__mutex_trylock_or_handoff(lock, first)) break; + if (first) { + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + break; + trace_contention_begin(lock, LCB_F_MUTEX); + } + raw_spin_lock(&lock->wait_lock); } raw_spin_lock(&lock->wait_lock); @@ -707,6 +719,7 @@ acquired: skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); + trace_contention_end(lock, 0); if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); @@ -719,6 +732,7 @@ err: __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + trace_contention_end(lock, ret); raw_spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 70a32a576f3f..5fe4c5495ba3 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,7 +7,9 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> +#include <linux/sched/debug.h> #include <linux/errno.h> +#include <trace/events/lock.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *key) @@ -162,7 +164,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { if (__percpu_down_read_trylock(sem)) return true; @@ -170,9 +172,11 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) if (try) return false; + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); percpu_rwsem_wait(sem, /* .reader = */ true); preempt_disable(); + trace_contention_end(sem, 0); return true; } @@ -211,10 +215,11 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) return true; } -void percpu_down_write(struct percpu_rw_semaphore *sem) +void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); @@ -236,6 +241,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) /* Wait for all active readers to complete. */ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); + trace_contention_end(sem, 0); } EXPORT_SYMBOL_GPL(percpu_down_write); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index ec36b73f4733..2e1600906c9f 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,10 +12,11 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> +#include <trace/events/lock.h> /** - * queued_read_lock_slowpath - acquire read lock of a queue rwlock - * @lock: Pointer to queue rwlock structure + * queued_read_lock_slowpath - acquire read lock of a queued rwlock + * @lock: Pointer to queued rwlock structure */ void queued_read_lock_slowpath(struct qrwlock *lock) { @@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock) } atomic_sub(_QR_BIAS, &lock->cnts); + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ); + /* * Put the reader into the wait queue */ @@ -51,17 +54,21 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * Signal the next one in queue to become queue head */ arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queued_write_lock_slowpath - acquire write lock of a queue rwlock - * @lock : Pointer to queue rwlock structure + * queued_write_lock_slowpath - acquire write lock of a queued rwlock + * @lock : Pointer to queued rwlock structure */ void queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE); + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index cbff6ba53d56..65a9a10caa6f 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -22,6 +22,7 @@ #include <linux/prefetch.h> #include <asm/byteorder.h> #include <asm/qspinlock.h> +#include <trace/events/lock.h> /* * Include queued spinlock statistics code @@ -401,6 +402,8 @@ pv_queue: idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + trace_contention_begin(lock, LCB_F_SPIN); + /* * 4 nodes are allocated based on the assumption that there will * not be nested NMIs taking spinlocks. That may not be true in @@ -554,6 +557,8 @@ locked: pv_kick_node(lock, next); release: + trace_contention_end(lock, 0); + /* * release the node */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8555c4efe97c..7779ee8abc2a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -24,6 +24,8 @@ #include <linux/sched/wake_q.h> #include <linux/ww_mutex.h> +#include <trace/events/lock.h> + #include "rtmutex_common.h" #ifndef WW_RT @@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, set_current_state(state); + trace_contention_begin(lock, LCB_F_RT); + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); if (likely(!ret)) ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); @@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, * unconditionally. We might have to fix that up. */ fixup_rt_mutex_waiters(lock); + + trace_contention_end(lock, ret); + return ret; } @@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) /* Save current state and set state to TASK_RTLOCK_WAIT */ current_save_and_set_rtlock_wait_state(); + trace_contention_begin(lock, LCB_F_RT); + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); for (;;) { @@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) */ fixup_rt_mutex_waiters(lock); debug_rt_mutex_free_waiter(&waiter); + + trace_contention_end(lock, 0); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 6fd3162e4098..c201aadb9301 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, * Reader2 to call up_read(), which might be unbound. */ + trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ); + /* * For rwlocks this returns 0 unconditionally, so the below * !ret conditionals are optimized out. @@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, raw_spin_unlock_irq(&rtm->wait_lock); if (!ret) rwbase_rtmutex_unlock(rtm); + + trace_contention_end(rwb, ret); return ret; } @@ -247,11 +251,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, goto out_unlock; rwbase_set_and_save_current_state(state); + trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE); for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); + trace_contention_end(rwb, -EINTR); return -EINTR; } @@ -265,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, set_current_state(state); } rwbase_restore_current_state(); + trace_contention_end(rwb, 0); out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 69aba4abe104..9d1db4a54d34 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT #include "lock_events.h" @@ -375,16 +376,19 @@ rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) * * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of * this function. Modify with care. + * + * Return: true if wait_list isn't empty and false otherwise */ -static inline void +static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); list_del(&waiter->list); if (likely(!list_empty(&sem->wait_list))) - return; + return true; atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); + return false; } /* @@ -559,6 +563,33 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, } /* + * Remove a waiter and try to wake up other waiters in the wait queue + * This function is called from the out_nolock path of both the reader and + * writer slowpaths with wait_lock held. It releases the wait_lock and + * optionally wake up waiters before it returns. + */ +static inline void +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, + struct wake_q_head *wake_q) + __releases(&sem->wait_lock) +{ + bool first = rwsem_first_waiter(sem) == waiter; + + wake_q_init(wake_q); + + /* + * If the wait_list isn't empty and the waiter to be deleted is + * the first waiter, we wake up the remaining waiters as they may + * be eligible to acquire or spin on the lock. + */ + if (rwsem_del_waiter(sem, waiter) && first) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + if (!wake_q_empty(wake_q)) + wake_up_q(wake_q); +} + +/* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. @@ -901,7 +932,7 @@ done: */ static inline void clear_nonspinnable(struct rw_semaphore *sem) { - if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) + if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))) atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); } @@ -926,6 +957,31 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) #endif /* + * Prepare to wake up waiter(s) in the wait queue by putting them into the + * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely + * reader-owned, wake up read lock waiters in queue front or wake up any + * front waiter otherwise. + + * This is being called from both reader and writer slow paths. + */ +static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count, + struct wake_q_head *wake_q) +{ + enum rwsem_wake_type wake_type; + + if (count & RWSEM_WRITER_MASK) + return; + + if (count & RWSEM_READER_MASK) { + wake_type = RWSEM_WAKE_READERS; + } else { + wake_type = RWSEM_WAKE_ANY; + clear_nonspinnable(sem); + } + rwsem_mark_wake(sem, wake_type, wake_q); +} + +/* * Wait for the read lock to be granted */ static struct rw_semaphore __sched * @@ -935,7 +991,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); - bool wake = false; /* * To prevent a constant stream of readers from starving a sleeping @@ -977,12 +1032,11 @@ queue: if (list_empty(&sem->wait_list)) { /* * In case the wait queue is empty and the lock isn't owned - * by a writer or has the handoff bit set, this reader can - * exit the slowpath and return immediately as its - * RWSEM_READER_BIAS has already been set in the count. + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been set + * in the count. */ - if (!(atomic_long_read(&sem->count) & - (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { /* Provide lock ACQUIRE */ smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); @@ -997,22 +1051,13 @@ queue: /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (!(count & RWSEM_LOCK_MASK)) { - clear_nonspinnable(sem); - wake = true; - } - if (wake || (!(count & RWSEM_WRITER_MASK) && - (adjustment & RWSEM_FLAG_WAITERS))) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - + rwsem_cond_wake_waiter(sem, count, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); + + trace_contention_begin(sem, LCB_F_READ); /* wait to be given the lock */ for (;;) { @@ -1035,23 +1080,23 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); + trace_contention_end(sem, 0); return sem; out_nolock: - rwsem_del_waiter(sem, &waiter); - raw_spin_unlock_irq(&sem->wait_lock); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } /* * Wait until we successfully acquire the write lock */ -static struct rw_semaphore * +static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - long count; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); @@ -1075,23 +1120,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* we're now waiting on the lock */ if (rwsem_first_waiter(sem) != &waiter) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and: - * 1) there are no active locks, wake the front - * queued process(es) as the handoff bit might be set. - * 2) there are no active writers and some readers, the lock - * must be read owned; so we try to wake any read lock - * waiters that were queued ahead of us. - */ - if (count & RWSEM_WRITER_MASK) - goto wait; - - rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) - ? RWSEM_WAKE_READERS - : RWSEM_WAKE_ANY, &wake_q); - + rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), + &wake_q); if (!wake_q_empty(&wake_q)) { /* * We want to minimize wait_lock hold time especially @@ -1099,16 +1129,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) */ raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); - wake_q_init(&wake_q); /* Used again, reinit */ raw_spin_lock_irq(&sem->wait_lock); } } else { atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } -wait: /* wait until we successfully acquire the lock */ set_current_state(state); + trace_contention_begin(sem, LCB_F_WRITE); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1148,17 +1178,15 @@ trylock_again: __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); + trace_contention_end(sem, 0); return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - rwsem_del_waiter(sem, &waiter); - if (!list_empty(&sem->wait_list)) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); lockevent_inc(rwsem_wlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 9ee381e4d2a4..f2654d2fe43a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -32,6 +32,7 @@ #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> +#include <trace/events/lock.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -205,7 +206,7 @@ struct semaphore_waiter { * constant, and thus optimised away by the compiler. Likewise the * 'timeout' parameter for the cases without timeouts. */ -static inline int __sched __down_common(struct semaphore *sem, long state, +static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { struct semaphore_waiter waiter; @@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + int ret; + + trace_contention_begin(sem, 0); + ret = ___down_common(sem, state, timeout); + trace_contention_end(sem, ret); + + return ret; +} + static noinline void __sched __down(struct semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); diff --git a/kernel/module-internal.h b/kernel/module-internal.h deleted file mode 100644 index 8c381c99062f..000000000000 --- a/kernel/module-internal.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Module internals - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/elf.h> -#include <asm/module.h> - -struct load_info { - const char *name; - /* pointer to module in temporary copy, freed at end of load_module() */ - struct module *mod; - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug *debug; - unsigned int num_debug; - bool sig_ok; -#ifdef CONFIG_KALLSYMS - unsigned long mod_kallsyms_init_off; -#endif -#ifdef CONFIG_MODULE_DECOMPRESS - struct page **pages; - unsigned int max_pages; - unsigned int used_pages; -#endif - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - -extern int mod_verify_sig(const void *mod, struct load_info *info); - -#ifdef CONFIG_MODULE_DECOMPRESS -int module_decompress(struct load_info *info, const void *buf, size_t size); -void module_decompress_cleanup(struct load_info *info); -#else -static inline int module_decompress(struct load_info *info, - const void *buf, size_t size) -{ - return -EOPNOTSUPP; -} -static inline void module_decompress_cleanup(struct load_info *info) -{ -} -#endif diff --git a/kernel/module/Makefile b/kernel/module/Makefile new file mode 100644 index 000000000000..948efea81e85 --- /dev/null +++ b/kernel/module/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for linux kernel module support +# + +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n + +obj-y += main.o strict_rwx.o +obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o +obj-$(CONFIG_MODULE_SIG) += signing.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o +obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_PROC_FS) += procfs.o +obj-$(CONFIG_SYSFS) += sysfs.o +obj-$(CONFIG_KGDB_KDB) += kdb.o +obj-$(CONFIG_MODVERSIONS) += version.o +obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c new file mode 100644 index 000000000000..12a569d361e8 --- /dev/null +++ b/kernel/module/debug_kmemleak.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kmemleak support + * + * Copyright (C) 2009 Catalin Marinas + */ + +#include <linux/module.h> +#include <linux/kmemleak.h> +#include "internal.h" + +void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < info->hdr->e_shnum; i++) { + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); + } +} diff --git a/kernel/module_decompress.c b/kernel/module/decompress.c index ffef98a20320..2fc7081dd7c1 100644 --- a/kernel/module_decompress.c +++ b/kernel/module/decompress.c @@ -12,7 +12,7 @@ #include <linux/sysfs.h> #include <linux/vmalloc.h> -#include "module-internal.h" +#include "internal.h" static int module_extend_max_pages(struct load_info *info, unsigned int extent) { @@ -113,6 +113,7 @@ static ssize_t module_gzip_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); + if (!page) { retval = -ENOMEM; goto out_inflate_end; @@ -171,6 +172,7 @@ static ssize_t module_xz_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); + if (!page) { retval = -ENOMEM; goto out; @@ -256,6 +258,7 @@ static ssize_t compression_show(struct kobject *kobj, { return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); } + static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); static int __init module_decompress_sysfs_init(void) diff --git a/kernel/module/internal.h b/kernel/module/internal.h new file mode 100644 index 000000000000..bc5507ab8450 --- /dev/null +++ b/kernel/module/internal.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/elf.h> +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* If this is set, the section belongs in the init part of the module */ +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1)) +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +#define data_layout core_layout +#endif + +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_STRICT_MODULE_RWX=y + */ +#ifdef CONFIG_STRICT_MODULE_RWX +# define strict_align(X) PAGE_ALIGN(X) +#else +# define strict_align(X) (X) +#endif + +extern struct mutex module_mutex; +extern struct list_head modules; + +extern struct module_attribute *modinfo_attrs[]; +extern size_t modinfo_attrs_count; + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const s32 __start___kcrctab[]; +extern const s32 __start___kcrctab_gpl[]; + +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; + struct _ddebug *debug; + unsigned int num_debug; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif +#ifdef CONFIG_MODULE_DECOMPRESS + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +enum mod_license { + NOT_GPL_ONLY, + GPL_ONLY, +}; + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const s32 *crc; + const struct kernel_symbol *sym; + enum mod_license license; +}; + +int mod_verify_sig(const void *mod, struct load_info *info); +int try_to_force_load(struct module *mod, const char *reason); +bool find_symbol(struct find_symbol_arg *fsa); +struct module *find_module_all(const char *name, size_t len, bool even_unformed); +int cmp_name(const void *name, const void *sym); +long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, + unsigned int section); +char *module_flags(struct module *mod, char *buf); +size_t module_flags_taint(unsigned long taints, char *buf); + +static inline void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON_ONCE(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + +static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +#ifdef CONFIG_LIVEPATCH +int copy_module_elf(struct module *mod, struct load_info *info); +void free_module_elf(struct module *mod); +#else /* !CONFIG_LIVEPATCH */ +static inline int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static inline void free_module_elf(struct module *mod) { } +#endif /* CONFIG_LIVEPATCH */ + +static inline bool set_livepatch_module(struct module *mod) +{ +#ifdef CONFIG_LIVEPATCH + mod->klp = true; + return true; +#else + return false; +#endif +} + +#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING +struct mod_unload_taint { + struct list_head list; + char name[MODULE_NAME_LEN]; + unsigned long taints; + u64 count; +}; + +int try_add_tainted_module(struct module *mod); +void print_unloaded_tainted_modules(void); +#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ +static inline int try_add_tainted_module(struct module *mod) +{ + return 0; +} + +static inline void print_unloaded_tainted_modules(void) +{ +} +#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif + +struct mod_tree_root { +#ifdef CONFIG_MODULES_TREE_LOOKUP + struct latch_tree_root root; +#endif + unsigned long addr_min; + unsigned long addr_max; +}; + +extern struct mod_tree_root mod_tree; +extern struct mod_tree_root mod_data_tree; + +#ifdef CONFIG_MODULES_TREE_LOOKUP +void mod_tree_insert(struct module *mod); +void mod_tree_remove_init(struct module *mod); +void mod_tree_remove(struct module *mod); +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree); +#else /* !CONFIG_MODULES_TREE_LOOKUP */ + +static inline void mod_tree_insert(struct module *mod) { } +static inline void mod_tree_remove_init(struct module *mod) { } +static inline void mod_tree_remove(struct module *mod) { } +static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} +#endif /* CONFIG_MODULES_TREE_LOOKUP */ + +void module_enable_ro(const struct module *mod, bool after_init); +void module_enable_nx(const struct module *mod); +void module_enable_x(const struct module *mod); +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod); +bool module_check_misalignment(const struct module *mod); + +#ifdef CONFIG_MODULE_SIG +int module_sig_check(struct load_info *info, int flags); +#else /* !CONFIG_MODULE_SIG */ +static inline int module_sig_check(struct load_info *info, int flags) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +#ifdef CONFIG_DEBUG_KMEMLEAK +void kmemleak_load_module(const struct module *mod, const struct load_info *info); +#else /* !CONFIG_DEBUG_KMEMLEAK */ +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } +#endif /* CONFIG_DEBUG_KMEMLEAK */ + +#ifdef CONFIG_KALLSYMS +void init_build_id(struct module *mod, const struct load_info *info); +void layout_symtab(struct module *mod, struct load_info *info); +void add_kallsyms(struct module *mod, const struct load_info *info); +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} +#else /* !CONFIG_KALLSYMS */ +static inline void init_build_id(struct module *mod, const struct load_info *info) { } +static inline void layout_symtab(struct module *mod, struct load_info *info) { } +static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } +#endif /* CONFIG_KALLSYMS */ + +#ifdef CONFIG_SYSFS +int mod_sysfs_setup(struct module *mod, const struct load_info *info, + struct kernel_param *kparam, unsigned int num_params); +void mod_sysfs_teardown(struct module *mod); +void init_param_lock(struct module *mod); +#else /* !CONFIG_SYSFS */ +static inline int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline void mod_sysfs_teardown(struct module *mod) { } +static inline void init_param_lock(struct module *mod) { } +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODVERSIONS +int check_version(const struct load_info *info, + const char *symname, struct module *mod, const s32 *crc); +void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, + struct kernel_symbol *ks, struct tracepoint * const *tp); +int check_modstruct_version(const struct load_info *info, struct module *mod); +int same_magic(const char *amagic, const char *bmagic, bool has_crcs); +#else /* !CONFIG_MODVERSIONS */ +static inline int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + return 1; +} + +static inline int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + return 1; +} + +static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs) +{ + return strcmp(amagic, bmagic) == 0; +} +#endif /* CONFIG_MODVERSIONS */ diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c new file mode 100644 index 000000000000..3e11523bc6f6 --- /dev/null +++ b/kernel/module/kallsyms.c @@ -0,0 +1,512 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kallsyms support + * + * Copyright (C) 2010 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/buildid.h> +#include <linux/bsearch.h> +#include "internal.h" + +/* Lookup exported symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_exported_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); +} + +static int is_exported(const char *name, unsigned long value, + const struct module *mod) +{ + const struct kernel_symbol *ks; + + if (!mod) + ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); + else + ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); + + return ks && kernel_symbol_value(ks) == value; +} + +/* As per nm */ +static char elf_type(const Elf_Sym *sym, const struct load_info *info) +{ + const Elf_Shdr *sechdrs = info->sechdrs; + + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) + return 'v'; + else + return 'w'; + } + if (sym->st_shndx == SHN_UNDEF) + return 'U'; + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) + return 'a'; + if (sym->st_shndx >= SHN_LORESERVE) + return '?'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) + return 't'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC && + sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { + if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) + return 'r'; + else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 'g'; + else + return 'd'; + } + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 's'; + else + return 'b'; + } + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { + return 'n'; + } + return '?'; +} + +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int pcpundx) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF || + src->st_shndx >= shnum || + !src->st_name) + return false; + +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ +void layout_symtab(struct module *mod, struct load_info *info) +{ + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; + const Elf_Sym *src; + unsigned int i, nsrc, ndst, strtab_size = 0; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect, + info->index.sym) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); + + src = (void *)info->hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + strtab_size += strlen(&info->strtab[src[i].st_name]) + 1; + ndst++; + } + } + + /* Append room for core symbols at end of core part. */ + info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->data_layout.size += strtab_size; + info->core_typeoffs = mod->data_layout.size; + mod->data_layout.size += ndst * sizeof(char); + mod->data_layout.size = strict_align(mod->data_layout.size); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect, + info->index.str) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); + mod->init_layout.size = strict_align(mod->init_layout.size); +} + +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ +void add_kallsyms(struct module *mod, const struct load_info *info) +{ + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + + /* Set up to point into init section. */ + mod->kallsyms = (void __rcu *)mod->init_layout.base + + info->mod_kallsyms_init_off; + + preempt_disable(); + /* The following is safe since this pointer cannot change */ + rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr; + rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + rcu_dereference_sched(mod->kallsyms)->strtab = + (void *)info->sechdrs[info->index.str].sh_addr; + rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; + + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. + */ + mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; + src = rcu_dereference_sched(mod->kallsyms)->symtab; + for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) { + rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + rcu_dereference_sched(mod->kallsyms)->typetab[i]; + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strscpy(s, + &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } + } + preempt_enable(); + mod->core_kallsyms.num_symtab = ndst; +} + +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) +void init_build_id(struct module *mod, const struct load_info *info) +{ + const Elf_Shdr *sechdr; + unsigned int i; + + for (i = 0; i < info->hdr->e_shnum; i++) { + sechdr = &info->sechdrs[i]; + if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && + !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, + sechdr->sh_size)) + break; + } +} +#else +void init_build_id(struct module *mod, const struct load_info *info) +{ +} +#endif + +/* + * This ignores the intensely annoying "mapping symbols" found + * in ARM ELF files: $a, $t and $d. + */ +static inline int is_arm_mapping_symbol(const char *str) +{ + if (str[0] == '.' && str[1] == 'L') + return true; + return str[0] == '$' && strchr("axtd", str[1]) && + (str[2] == '\0' || str[2] == '.'); +} + +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + +/* + * Given a module and address, find the corresponding symbol and return its name + * while providing its size and offset if needed. + */ +static const char *find_kallsyms_symbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) +{ + unsigned int i, best = 0; + unsigned long nextval, bestval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + /* At worse, next value is at end of module */ + if (within_module_init(addr, mod)) + nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size; + else + nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size; + + bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); + + /* + * Scan for closest preceding symbol, and next symbol. (ELF + * starts real symbols at 1). + */ + for (i = 1; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + unsigned long thisval = kallsyms_symbol_value(sym); + + if (sym->st_shndx == SHN_UNDEF) + continue; + + /* + * We ignore unnamed symbols: they're uninformative + * and inserted at a whim. + */ + if (*kallsyms_symbol_name(kallsyms, i) == '\0' || + is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) + continue; + + if (thisval <= addr && thisval > bestval) { + best = i; + bestval = thisval; + } + if (thisval > addr && thisval < nextval) + nextval = thisval; + } + + if (!best) + return NULL; + + if (size) + *size = nextval - bestval; + if (offset) + *offset = addr - bestval; + + return kallsyms_symbol_name(kallsyms, best); +} + +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + +/* + * For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + const char *ret = NULL; + struct module *mod; + + preempt_disable(); + mod = __module_address(addr); + if (mod) { + if (modname) + *modname = mod->name; + if (modbuildid) { +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) + *modbuildid = mod->build_id; +#else + *modbuildid = NULL; +#endif + } + + ret = find_kallsyms_symbol(mod, addr, size, offset); + } + /* Make a copy in here where it's safe */ + if (ret) { + strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + ret = namebuf; + } + preempt_enable(); + + return ret; +} + +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + + strscpy(symname, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strscpy(modname, mod->name, MODULE_NAME_LEN); + if (name) + strscpy(name, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + const Elf_Sym *sym = &kallsyms->symtab[symnum]; + + *value = kallsyms_symbol_value(sym); + *type = kallsyms->typetab[symnum]; + strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); + strscpy(module_name, mod->name, MODULE_NAME_LEN); + *exported = is_exported(name, *value, mod); + preempt_enable(); + return 0; + } + symnum -= kallsyms->num_symtab; + } + preempt_enable(); + return -ERANGE; +} + +/* Given a module and name of symbol, find and return the symbol's value */ +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) +{ + unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && + sym->st_shndx != SHN_UNDEF) + return kallsyms_symbol_value(sym); + } + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + struct module *mod; + char *colon; + unsigned long ret = 0; + + /* Don't lock: we're in enough trouble already. */ + preempt_disable(); + if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + if ((mod = find_module_all(name, colon - name, false)) != NULL) + ret = find_kallsyms_symbol_value(mod, colon + 1); + } else { + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) + break; + } + } + preempt_enable(); + return ret; +} + +#ifdef CONFIG_LIVEPATCH +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ + preempt_disable(); + kallsyms = rcu_dereference_sched(mod->kallsyms); + preempt_enable(); + + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (sym->st_shndx == SHN_UNDEF) + continue; + + ret = fn(data, kallsyms_symbol_name(kallsyms, i), + mod, kallsyms_symbol_value(sym)); + if (ret != 0) + goto out; + } + } +out: + mutex_unlock(&module_mutex); + return ret; +} +#endif /* CONFIG_LIVEPATCH */ diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c new file mode 100644 index 000000000000..f4317f92e189 --- /dev/null +++ b/kernel/module/kdb.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kdb support + * + * Copyright (C) 2010 Jason Wessel + */ + +#include <linux/module.h> +#include <linux/kdb.h> +#include "internal.h" + +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. + */ +int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + kdb_printf("%-20s%8u", mod->name, mod->core_layout.size); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + kdb_printf("/%8u", mod->data_layout.size); +#endif + kdb_printf(" 0x%px ", (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + kdb_printf(" 0x%px", mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + kdb_printf("/0x%px", mod->data_layout.base); +#endif + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + + kdb_printf(" [ "); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c new file mode 100644 index 000000000000..486d4ff92719 --- /dev/null +++ b/kernel/module/livepatch.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module livepatch support + * + * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com> + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include "internal.h" + +/* + * Persist Elf information about a module. Copy the Elf header, + * section header table, section string table, and symtab section + * index from info to mod->klp_info. + */ +int copy_module_elf(struct module *mod, struct load_info *info) +{ + unsigned int size, symndx; + int ret; + + size = sizeof(*mod->klp_info); + mod->klp_info = kmalloc(size, GFP_KERNEL); + if (!mod->klp_info) + return -ENOMEM; + + /* Elf header */ + size = sizeof(mod->klp_info->hdr); + memcpy(&mod->klp_info->hdr, info->hdr, size); + + /* Elf section header table */ + size = sizeof(*info->sechdrs) * info->hdr->e_shnum; + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); + if (!mod->klp_info->sechdrs) { + ret = -ENOMEM; + goto free_info; + } + + /* Elf section name string table */ + size = info->sechdrs[info->hdr->e_shstrndx].sh_size; + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); + if (!mod->klp_info->secstrings) { + ret = -ENOMEM; + goto free_sechdrs; + } + + /* Elf symbol section index */ + symndx = info->index.sym; + mod->klp_info->symndx = symndx; + + /* + * For livepatch modules, core_kallsyms.symtab is a complete + * copy of the original symbol table. Adjust sh_addr to point + * to core_kallsyms.symtab since the copy of the symtab in module + * init memory is freed at the end of do_init_module(). + */ + mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab; + + return 0; + +free_sechdrs: + kfree(mod->klp_info->sechdrs); +free_info: + kfree(mod->klp_info); + return ret; +} + +void free_module_elf(struct module *mod) +{ + kfree(mod->klp_info->sechdrs); + kfree(mod->klp_info->secstrings); + kfree(mod->klp_info); +} diff --git a/kernel/module.c b/kernel/module/main.c index 46a5c2ed1928..fed58d30725d 100644 --- a/kernel/module.c +++ b/kernel/module/main.c @@ -14,16 +14,12 @@ #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/buildid.h> -#include <linux/file.h> #include <linux/fs.h> -#include <linux/sysfs.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> -#include <linux/proc_fs.h> -#include <linux/security.h> #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/fcntl.h> @@ -58,236 +54,69 @@ #include <linux/dynamic_debug.h> #include <linux/audit.h> #include <uapi/linux/module.h> -#include "module-internal.h" +#include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/module.h> -#ifndef ARCH_SHF_SMALL -#define ARCH_SHF_SMALL 0 -#endif - -/* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -# define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif - -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) - /* * Mutex protects: * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, - * 3) module_addr_min/module_addr_max. + * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). */ -static DEFINE_MUTEX(module_mutex); -static LIST_HEAD(modules); +DEFINE_MUTEX(module_mutex); +LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ static void do_free_init(struct work_struct *w); static DECLARE_WORK(init_free_wq, do_free_init); static LLIST_HEAD(init_free_list); -#ifdef CONFIG_MODULES_TREE_LOOKUP - -/* - * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. - * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. - */ - -static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->base; -} - -static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->size; -} - -static __always_inline bool -mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) -{ - return __mod_tree_val(a) < __mod_tree_val(b); -} - -static __always_inline int -mod_tree_comp(void *key, struct latch_tree_node *n) -{ - unsigned long val = (unsigned long)key; - unsigned long start, end; - - start = __mod_tree_val(n); - if (val < start) - return -1; - - end = start + __mod_tree_size(n); - if (val >= end) - return 1; - - return 0; -} - -static const struct latch_tree_ops mod_tree_ops = { - .less = mod_tree_less, - .comp = mod_tree_comp, +struct mod_tree_root mod_tree __cacheline_aligned = { + .addr_min = -1UL, }; -static struct mod_tree_root { - struct latch_tree_root root; - unsigned long addr_min; - unsigned long addr_max; -} mod_tree __cacheline_aligned = { +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +struct mod_tree_root mod_data_tree __cacheline_aligned = { .addr_min = -1UL, }; +#endif #define module_addr_min mod_tree.addr_min #define module_addr_max mod_tree.addr_max -static noinline void __mod_tree_insert(struct mod_tree_node *node) -{ - latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); -} - -static void __mod_tree_remove(struct mod_tree_node *node) -{ - latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); -} - -/* - * These modifications: insert, remove_init and remove; are serialized by the - * module_mutex. - */ -static void mod_tree_insert(struct module *mod) -{ - mod->core_layout.mtn.mod = mod; - mod->init_layout.mtn.mod = mod; - - __mod_tree_insert(&mod->core_layout.mtn); - if (mod->init_layout.size) - __mod_tree_insert(&mod->init_layout.mtn); -} - -static void mod_tree_remove_init(struct module *mod) -{ - if (mod->init_layout.size) - __mod_tree_remove(&mod->init_layout.mtn); -} - -static void mod_tree_remove(struct module *mod) -{ - __mod_tree_remove(&mod->core_layout.mtn); - mod_tree_remove_init(mod); -} - -static struct module *mod_find(unsigned long addr) -{ - struct latch_tree_node *ltn; - - ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); - if (!ltn) - return NULL; - - return container_of(ltn, struct mod_tree_node, node)->mod; -} - -#else /* MODULES_TREE_LOOKUP */ - -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - -static void mod_tree_insert(struct module *mod) { } -static void mod_tree_remove_init(struct module *mod) { } -static void mod_tree_remove(struct module *mod) { } - -static struct module *mod_find(unsigned long addr) -{ - struct module *mod; - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - if (within_module(addr, mod)) - return mod; - } - - return NULL; -} - -#endif /* MODULES_TREE_LOOKUP */ +struct symsearch { + const struct kernel_symbol *start, *stop; + const s32 *crcs; + enum mod_license license; +}; /* * Bounds of module text, for speeding up __module_address. * Protected by module_mutex. */ -static void __mod_update_bounds(void *base, unsigned int size) +static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree) { unsigned long min = (unsigned long)base; unsigned long max = min + size; - if (min < module_addr_min) - module_addr_min = min; - if (max > module_addr_max) - module_addr_max = max; + if (min < tree->addr_min) + tree->addr_min = min; + if (max > tree->addr_max) + tree->addr_max = max; } static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree); if (mod->init_layout.size) - __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); -} - -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ - -static void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - -#ifdef CONFIG_MODULE_SIG -static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); -module_param(sig_enforce, bool_enable_only, 0644); - -void set_module_sig_enforced(void) -{ - sig_enforce = true; -} -#else -#define sig_enforce false + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + __mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree); #endif - -/* - * Export sig_enforce kernel cmdline parameter to allow other subsystems rely - * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. - */ -bool is_module_sig_enforced(void) -{ - return sig_enforce; } -EXPORT_SYMBOL(is_module_sig_enforced); /* Block module loading/unloading? */ int modules_disabled = 0; @@ -335,7 +164,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, /* * A thread that wants to hold a reference to a module only while it - * is running can call this to safely exit. nfsd and lockd use this. + * is running can call this to safely exit. */ void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) { @@ -408,66 +237,12 @@ static __maybe_unused void *any_section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } -/* Provided by the linker */ -extern const struct kernel_symbol __start___ksymtab[]; -extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const s32 __start___kcrctab[]; -extern const s32 __start___kcrctab_gpl[]; - #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL #else #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -struct symsearch { - const struct kernel_symbol *start, *stop; - const s32 *crcs; - enum mod_license { - NOT_GPL_ONLY, - GPL_ONLY, - } license; -}; - -struct find_symbol_arg { - /* Input */ - const char *name; - bool gplok; - bool warn; - - /* Output */ - struct module *owner; - const s32 *crc; - const struct kernel_symbol *sym; - enum mod_license license; -}; - -static bool check_exported_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) -{ - struct find_symbol_arg *fsa = data; - - if (!fsa->gplok && syms->license == GPL_ONLY) - return false; - fsa->owner = owner; - fsa->crc = symversion(syms->crcs, symnum); - fsa->sym = &syms->start[symnum]; - fsa->license = syms->license; - return true; -} - -static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) -{ -#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS - return (unsigned long)offset_to_ptr(&sym->value_offset); -#else - return sym->value; -#endif -} - static const char *kernel_symbol_name(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -488,33 +263,38 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) #endif } -static int cmp_name(const void *name, const void *sym) +int cmp_name(const void *name, const void *sym) { return strcmp(name, kernel_symbol_name(sym)); } static bool find_exported_symbol_in_section(const struct symsearch *syms, struct module *owner, - void *data) + struct find_symbol_arg *fsa) { - struct find_symbol_arg *fsa = data; struct kernel_symbol *sym; + if (!fsa->gplok && syms->license == GPL_ONLY) + return false; + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); + if (!sym) + return false; - if (sym != NULL && check_exported_symbol(syms, owner, - sym - syms->start, data)) - return true; + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, sym - syms->start); + fsa->sym = sym; + fsa->license = syms->license; - return false; + return true; } /* * Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -static bool find_symbol(struct find_symbol_arg *fsa) +bool find_symbol(struct find_symbol_arg *fsa) { static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, @@ -558,8 +338,8 @@ static bool find_symbol(struct find_symbol_arg *fsa) * Search for module by name: must hold module_mutex (or preempt disabled * for read-only access). */ -static struct module *find_module_all(const char *name, size_t len, - bool even_unformed) +struct module *find_module_all(const char *name, size_t len, + bool even_unformed) { struct module *mod; @@ -985,31 +765,6 @@ out: return ret; } -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - struct module_use *use; - int printed_something = 0; - - seq_printf(m, " %i ", module_refcount(mod)); - - /* - * Always include a trailing , so userspace can differentiate - * between this and the old multi-field proc format. - */ - list_for_each_entry(use, &mod->source_list, source_list) { - printed_something = 1; - seq_printf(m, "%s,", use->source->name); - } - - if (mod->init != NULL && mod->exit == NULL) { - printed_something = 1; - seq_puts(m, "[permanent],"); - } - - if (!printed_something) - seq_puts(m, "-"); -} - void __symbol_put(const char *symbol) { struct find_symbol_arg fsa = { @@ -1099,12 +854,6 @@ void module_put(struct module *module) EXPORT_SYMBOL(module_put); #else /* !CONFIG_MODULE_UNLOAD */ -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - /* We don't know the usage count, or what modules are using. */ - seq_puts(m, " - -"); -} - static inline void module_unload_free(struct module *mod) { } @@ -1120,13 +869,13 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ -static size_t module_flags_taint(struct module *mod, char *buf) +size_t module_flags_taint(unsigned long taints, char *buf) { size_t l = 0; int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &mod->taints)) + if (taint_flags[i].module && test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } @@ -1179,6 +928,17 @@ static ssize_t show_coresize(struct module_attribute *mattr, static struct module_attribute modinfo_coresize = __ATTR(coresize, 0444, show_coresize, NULL); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +static ssize_t show_datasize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->data_layout.size); +} + +static struct module_attribute modinfo_datasize = + __ATTR(datasize, 0444, show_datasize, NULL); +#endif + static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { @@ -1193,7 +953,7 @@ static ssize_t show_taint(struct module_attribute *mattr, { size_t l; - l = module_flags_taint(mk->mod, buffer); + l = module_flags_taint(mk->mod->taints, buffer); buffer[l++] = '\n'; return l; } @@ -1201,12 +961,15 @@ static ssize_t show_taint(struct module_attribute *mattr, static struct module_attribute modinfo_taint = __ATTR(taint, 0444, show_taint, NULL); -static struct module_attribute *modinfo_attrs[] = { +struct module_attribute *modinfo_attrs[] = { &module_uevent, &modinfo_version, &modinfo_srcversion, &modinfo_initstate, &modinfo_coresize, +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + &modinfo_datasize, +#endif &modinfo_initsize, &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD @@ -1215,9 +978,11 @@ static struct module_attribute *modinfo_attrs[] = { NULL, }; +size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs); + static const char vermagic[] = VERMAGIC_STRING; -static int try_to_force_load(struct module *mod, const char *reason) +int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) @@ -1229,115 +994,6 @@ static int try_to_force_load(struct module *mod, const char *reason) #endif } -#ifdef CONFIG_MODVERSIONS - -static u32 resolve_rel_crc(const s32 *crc) -{ - return *(u32 *)((void *)crc + *crc); -} - -static int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - Elf_Shdr *sechdrs = info->sechdrs; - unsigned int versindex = info->index.vers; - unsigned int i, num_versions; - struct modversion_info *versions; - - /* Exporting module didn't supply crcs? OK, we're already tainted. */ - if (!crc) - return 1; - - /* No versions at all? modprobe --force does this. */ - if (versindex == 0) - return try_to_force_load(mod, symname) == 0; - - versions = (void *) sechdrs[versindex].sh_addr; - num_versions = sechdrs[versindex].sh_size - / sizeof(struct modversion_info); - - for (i = 0; i < num_versions; i++) { - u32 crcval; - - if (strcmp(versions[i].name, symname) != 0) - continue; - - if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) - crcval = resolve_rel_crc(crc); - else - crcval = *crc; - if (versions[i].crc == crcval) - return 1; - pr_debug("Found checksum %X vs module %lX\n", - crcval, versions[i].crc); - goto bad_version; - } - - /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", info->name, symname); - return 1; - -bad_version: - pr_warn("%s: disagrees about version of symbol %s\n", - info->name, symname); - return 0; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - struct find_symbol_arg fsa = { - .name = "module_layout", - .gplok = true, - }; - - /* - * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. - */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); - return check_version(info, "module_layout", mod, fsa.crc); -} - -/* First part is kernel version, which we ignore if module has crcs. */ -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - if (has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); - } - return strcmp(amagic, bmagic) == 0; -} -#else -static inline int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - return 1; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - return 1; -} - -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - return strcmp(amagic, bmagic) == 0; -} -#endif /* CONFIG_MODVERSIONS */ - static char *get_modinfo(const struct load_info *info, const char *tag); static char *get_next_modinfo(const struct load_info *info, const char *tag, char *prev); @@ -1372,20 +1028,20 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } -static bool inherit_taint(struct module *mod, struct module *owner) +static bool inherit_taint(struct module *mod, struct module *owner, const char *name) { if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) return true; if (mod->using_gplonly_symbols) { - pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", - mod->name, owner->name); + pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n", + mod->name, name, owner->name); return false; } if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { - pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", - mod->name, owner->name); + pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n", + mod->name, name, owner->name); set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); } return true; @@ -1417,7 +1073,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (fsa.license == GPL_ONLY) mod->using_gplonly_symbols = true; - if (!inherit_taint(mod, fsa.owner)) { + if (!inherit_taint(mod, fsa.owner, name)) { fsa.sym = NULL; goto getname; } @@ -1465,674 +1121,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -#ifdef CONFIG_KALLSYMS -static inline bool sect_empty(const Elf_Shdr *sect) -{ - return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; -} -#endif - -/* - * /sys/module/foo/sections stuff - * J. Corbet <corbet@lwn.net> - */ -#ifdef CONFIG_SYSFS - -#ifdef CONFIG_KALLSYMS -struct module_sect_attr { - struct bin_attribute battr; - unsigned long address; -}; - -struct module_sect_attrs { - struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[]; -}; - -#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) -static ssize_t module_sect_read(struct file *file, struct kobject *kobj, - struct bin_attribute *battr, - char *buf, loff_t pos, size_t count) -{ - struct module_sect_attr *sattr = - container_of(battr, struct module_sect_attr, battr); - char bounce[MODULE_SECT_READ_SIZE + 1]; - size_t wrote; - - if (pos != 0) - return -EINVAL; - - /* - * Since we're a binary read handler, we must account for the - * trailing NUL byte that sprintf will write: if "buf" is - * too small to hold the NUL, or the NUL is exactly the last - * byte, the read will look like it got truncated by one byte. - * Since there is no way to ask sprintf nicely to not write - * the NUL, we have to use a bounce buffer. - */ - wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", - kallsyms_show_value(file->f_cred) - ? (void *)sattr->address : NULL); - count = min(count, wrote); - memcpy(buf, bounce, count); - - return count; -} - -static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -{ - unsigned int section; - - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].battr.attr.name); - kfree(sect_attrs); -} - -static void add_sect_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int nloaded = 0, i, size[2]; - struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct bin_attribute **gattr; - - /* Count loaded sections and allocate structures */ - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i])) - nloaded++; - size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.bin_attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); - if (sect_attrs == NULL) - return; - - /* Setup section attributes. */ - sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; - - sect_attrs->nsections = 0; - sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.bin_attrs[0]; - for (i = 0; i < info->hdr->e_shnum; i++) { - Elf_Shdr *sec = &info->sechdrs[i]; - if (sect_empty(sec)) - continue; - sysfs_bin_attr_init(&sattr->battr); - sattr->address = sec->sh_addr; - sattr->battr.attr.name = - kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (sattr->battr.attr.name == NULL) - goto out; - sect_attrs->nsections++; - sattr->battr.read = module_sect_read; - sattr->battr.size = MODULE_SECT_READ_SIZE; - sattr->battr.attr.mode = 0400; - *(gattr++) = &(sattr++)->battr; - } - *gattr = NULL; - - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) - goto out; - - mod->sect_attrs = sect_attrs; - return; - out: - free_sect_attrs(sect_attrs); -} - -static void remove_sect_attrs(struct module *mod) -{ - if (mod->sect_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->sect_attrs->grp); - /* - * We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. - */ - free_sect_attrs(mod->sect_attrs); - mod->sect_attrs = NULL; - } -} - -/* - * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. - */ - -struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[]; -}; - -static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. - */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) -{ - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } - kfree(notes_attrs); -} - -static void add_notes_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int notes, loaded, i; - struct module_notes_attrs *notes_attrs; - struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; - - /* Count notes sections and allocate structures. */ - notes = 0; - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i]) && - (info->sechdrs[i].sh_type == SHT_NOTE)) - ++notes; - - if (notes == 0) - return; - - notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), - GFP_KERNEL); - if (notes_attrs == NULL) - return; - - notes_attrs->notes = notes; - nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { - if (sect_empty(&info->sechdrs[i])) - continue; - if (info->sechdrs[i].sh_type == SHT_NOTE) { - sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; - nattr->attr.mode = S_IRUGO; - nattr->size = info->sechdrs[i].sh_size; - nattr->private = (void *) info->sechdrs[i].sh_addr; - nattr->read = module_notes_read; - ++nattr; - } - ++loaded; - } - - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) - goto out; - - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) - goto out; - - mod->notes_attrs = notes_attrs; - return; - - out: - free_notes_attrs(notes_attrs, i); -} - -static void remove_notes_attrs(struct module *mod) -{ - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); -} - -#else - -static inline void add_sect_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_sect_attrs(struct module *mod) -{ -} - -static inline void add_notes_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_notes_attrs(struct module *mod) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void del_usage_links(struct module *mod) -{ -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); - mutex_unlock(&module_mutex); -#endif -} - -static int add_usage_links(struct module *mod) -{ - int ret = 0; -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - ret = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - if (ret) - break; - } - mutex_unlock(&module_mutex); - if (ret) - del_usage_links(mod); -#endif - return ret; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end); - -static int module_add_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - struct module_attribute *temp_attr; - int error = 0; - int i; - - mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * - (ARRAY_SIZE(modinfo_attrs) + 1)), - GFP_KERNEL); - if (!mod->modinfo_attrs) - return -ENOMEM; - - temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (!attr->test || attr->test(mod)) { - memcpy(temp_attr, attr, sizeof(*temp_attr)); - sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj, - &temp_attr->attr); - if (error) - goto error_out; - ++temp_attr; - } - } - - return 0; - -error_out: - if (i > 0) - module_remove_modinfo_attrs(mod, --i); - else - kfree(mod->modinfo_attrs); - return error; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { - if (end >= 0 && i > end) - break; - /* pick a field to test for end of list */ - if (!attr->attr.name) - break; - sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); - if (attr->free) - attr->free(mod); - } - kfree(mod->modinfo_attrs); -} - -static void mod_kobject_put(struct module *mod) -{ - DECLARE_COMPLETION_ONSTACK(c); - mod->mkobj.kobj_completion = &c; - kobject_put(&mod->mkobj.kobj); - wait_for_completion(&c); -} - -static int mod_sysfs_init(struct module *mod) -{ - int err; - struct kobject *kobj; - - if (!module_sysfs_initialized) { - pr_err("%s: module sysfs not initialized\n", mod->name); - err = -EINVAL; - goto out; - } - - kobj = kset_find_obj(module_kset, mod->name); - if (kobj) { - pr_err("%s: module is already loaded\n", mod->name); - kobject_put(kobj); - err = -EINVAL; - goto out; - } - - mod->mkobj.mod = mod; - - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - mod->mkobj.kobj.kset = module_kset; - err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, - "%s", mod->name); - if (err) - mod_kobject_put(mod); - -out: - return err; -} - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - int err; - - err = mod_sysfs_init(mod); - if (err) - goto out; - - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); - if (!mod->holders_dir) { - err = -ENOMEM; - goto out_unreg; - } - - err = module_param_sysfs_setup(mod, kparam, num_params); - if (err) - goto out_unreg_holders; - - err = module_add_modinfo_attrs(mod); - if (err) - goto out_unreg_param; - - err = add_usage_links(mod); - if (err) - goto out_unreg_modinfo_attrs; - - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); - - return 0; - -out_unreg_modinfo_attrs: - module_remove_modinfo_attrs(mod, -1); -out_unreg_param: - module_param_sysfs_remove(mod); -out_unreg_holders: - kobject_put(mod->holders_dir); -out_unreg: - mod_kobject_put(mod); -out: - return err; -} - -static void mod_sysfs_fini(struct module *mod) -{ - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_put(mod); -} - -static void init_param_lock(struct module *mod) -{ - mutex_init(&mod->param_lock); -} -#else /* !CONFIG_SYSFS */ - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static void mod_sysfs_fini(struct module *mod) -{ -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ -} - -static void del_usage_links(struct module *mod) -{ -} - -static void init_param_lock(struct module *mod) -{ -} -#endif /* CONFIG_SYSFS */ - -static void mod_sysfs_teardown(struct module *mod) -{ - del_usage_links(mod); - module_remove_modinfo_attrs(mod, -1); - module_param_sysfs_remove(mod); - kobject_put(mod->mkobj.drivers_dir); - kobject_put(mod->holders_dir); - mod_sysfs_fini(mod); -} - -/* - * LKM RO/NX protection: protect module's text/ro-data - * from modification and any data from execution. - * - * General layout of module is: - * [text] [read-only-data] [ro-after-init] [writable data] - * text_size -----^ ^ ^ ^ - * ro_size ------------------------| | | - * ro_after_init_size -----------------------------| | - * size -----------------------------------------------------------| - * - * These values are always page-aligned (as is base) - */ - -/* - * Since some arches are moving towards PAGE_KERNEL module allocations instead - * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the - * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of - * whether we are strict. - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -static void frob_text(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base, - layout->text_size >> PAGE_SHIFT); -} - -static void module_enable_x(const struct module *mod) -{ - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); -} -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - -#ifdef CONFIG_STRICT_MODULE_RWX -static void frob_rodata(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->text_size, - (layout->ro_size - layout->text_size) >> PAGE_SHIFT); -} - -static void frob_ro_after_init(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); -} - -static void frob_writable_data(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_after_init_size, - (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); -} - -static void module_enable_ro(const struct module *mod, bool after_init) -{ - if (!rodata_enabled) - return; - - set_vm_flush_reset_perms(mod->core_layout.base); - set_vm_flush_reset_perms(mod->init_layout.base); - frob_text(&mod->core_layout, set_memory_ro); - - frob_rodata(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_rodata(&mod->init_layout, set_memory_ro); - - if (after_init) - frob_ro_after_init(&mod->core_layout, set_memory_ro); -} - -static void module_enable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_nx); - frob_ro_after_init(&mod->core_layout, set_memory_nx); - frob_writable_data(&mod->core_layout, set_memory_nx); - frob_rodata(&mod->init_layout, set_memory_nx); - frob_writable_data(&mod->init_layout, set_memory_nx); -} - -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; - int i; - - for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { - pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", - mod->name, secstrings + sechdrs[i].sh_name, i); - return -ENOEXEC; - } - } - - return 0; -} - -#else /* !CONFIG_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_ro(const struct module *mod, bool after_init) {} -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - return 0; -} -#endif /* CONFIG_STRICT_MODULE_RWX */ - -#ifdef CONFIG_LIVEPATCH -/* - * Persist Elf information about a module. Copy the Elf header, - * section header table, section string table, and symtab section - * index from info to mod->klp_info. - */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - unsigned int size, symndx; - int ret; - - size = sizeof(*mod->klp_info); - mod->klp_info = kmalloc(size, GFP_KERNEL); - if (mod->klp_info == NULL) - return -ENOMEM; - - /* Elf header */ - size = sizeof(mod->klp_info->hdr); - memcpy(&mod->klp_info->hdr, info->hdr, size); - - /* Elf section header table */ - size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); - if (mod->klp_info->sechdrs == NULL) { - ret = -ENOMEM; - goto free_info; - } - - /* Elf section name string table */ - size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); - if (mod->klp_info->secstrings == NULL) { - ret = -ENOMEM; - goto free_sechdrs; - } - - /* Elf symbol section index */ - symndx = info->index.sym; - mod->klp_info->symndx = symndx; - - /* - * For livepatch modules, core_kallsyms.symtab is a complete - * copy of the original symbol table. Adjust sh_addr to point - * to core_kallsyms.symtab since the copy of the symtab in module - * init memory is freed at the end of do_init_module(). - */ - mod->klp_info->sechdrs[symndx].sh_addr = \ - (unsigned long) mod->core_kallsyms.symtab; - - return 0; - -free_sechdrs: - kfree(mod->klp_info->sechdrs); -free_info: - kfree(mod->klp_info); - return ret; -} - -static void free_module_elf(struct module *mod) -{ - kfree(mod->klp_info->sechdrs); - kfree(mod->klp_info->secstrings); - kfree(mod->klp_info); -} -#else /* !CONFIG_LIVEPATCH */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - return 0; -} - -static void free_module_elf(struct module *mod) -{ -} -#endif /* CONFIG_LIVEPATCH */ - void __weak module_memfree(void *module_region) { /* @@ -2192,6 +1180,9 @@ static void free_module(struct module *mod) module_bug_cleanup(mod); /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ synchronize_rcu(); + if (try_add_tainted_module(mod)) + pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", + mod->name); mutex_unlock(&module_mutex); /* Clean up CFI for the module. */ @@ -2204,10 +1195,13 @@ static void free_module(struct module *mod) percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); /* Finally, free the core (containing the module structure) */ module_memfree(mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + vfree(mod->data_layout.base); +#endif } void *__symbol_get(const char *symbol) @@ -2395,7 +1389,7 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod, } /* Update size with this section: return offset. */ -static long get_offset(struct module *mod, unsigned int *size, +long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section) { long ret; @@ -2445,30 +1439,32 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; const char *sname = info->secstrings + s->sh_name; + unsigned int *sizep; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL || module_init_layout_section(sname)) continue; - s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); + sizep = m ? &mod->data_layout.size : &mod->core_layout.size; + s->sh_entsize = module_get_offset(mod, sizep, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_size = mod->core_layout.size; + mod->data_layout.size = strict_align(mod->data_layout.size); + mod->data_layout.ro_size = mod->data_layout.size; break; case 2: /* RO after init */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_after_init_size = mod->core_layout.size; + mod->data_layout.size = strict_align(mod->data_layout.size); + mod->data_layout.ro_after_init_size = mod->data_layout.size; break; case 4: /* whole core */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->data_layout.size = strict_align(mod->data_layout.size); break; } } @@ -2484,17 +1480,17 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !module_init_layout_section(sname)) continue; - s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) + s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; case 2: @@ -2505,7 +1501,7 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; break; case 4: /* whole init */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); break; } } @@ -2597,228 +1593,6 @@ static void free_modinfo(struct module *mod) } } -#ifdef CONFIG_KALLSYMS - -/* Lookup exported symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_exported_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - return bsearch(name, start, stop - start, - sizeof(struct kernel_symbol), cmp_name); -} - -static int is_exported(const char *name, unsigned long value, - const struct module *mod) -{ - const struct kernel_symbol *ks; - if (!mod) - ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); - else - ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); - - return ks != NULL && kernel_symbol_value(ks) == value; -} - -/* As per nm */ -static char elf_type(const Elf_Sym *sym, const struct load_info *info) -{ - const Elf_Shdr *sechdrs = info->sechdrs; - - if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { - if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) - return 'v'; - else - return 'w'; - } - if (sym->st_shndx == SHN_UNDEF) - return 'U'; - if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) - return 'a'; - if (sym->st_shndx >= SHN_LORESERVE) - return '?'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) - return 't'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC - && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { - if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) - return 'r'; - else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 'g'; - else - return 'd'; - } - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 's'; - else - return 'b'; - } - if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug")) { - return 'n'; - } - return '?'; -} - -static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum, unsigned int pcpundx) -{ - const Elf_Shdr *sec; - - if (src->st_shndx == SHN_UNDEF - || src->st_shndx >= shnum - || !src->st_name) - return false; - -#ifdef CONFIG_KALLSYMS_ALL - if (src->st_shndx == pcpundx) - return true; -#endif - - sec = sechdrs + src->st_shndx; - if (!(sec->sh_flags & SHF_ALLOC) -#ifndef CONFIG_KALLSYMS_ALL - || !(sec->sh_flags & SHF_EXECINSTR) -#endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) - return false; - - return true; -} - -/* - * We only allocate and copy the strings needed by the parts of symtab - * we keep. This is simple, but has the effect of making multiple - * copies of duplicates. We could be more sophisticated, see - * linux-kernel thread starting with - * <73defb5e4bca04a6431392cc341112b1@localhost>. - */ -static void layout_symtab(struct module *mod, struct load_info *info) -{ - Elf_Shdr *symsect = info->sechdrs + info->index.sym; - Elf_Shdr *strsect = info->sechdrs + info->index.str; - const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size = 0; - - /* Put symbol section at end of init part of module. */ - symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, - info->index.sym) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + symsect->sh_name); - - src = (void *)info->hdr + symsect->sh_offset; - nsrc = symsect->sh_size / sizeof(*src); - - /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = 0; i < nsrc; i++) { - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - strtab_size += strlen(&info->strtab[src[i].st_name])+1; - ndst++; - } - } - - /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_layout.size += strtab_size; - info->core_typeoffs = mod->core_layout.size; - mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = debug_align(mod->core_layout.size); - - /* Put string table section at end of init part of module. */ - strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, - info->index.str) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + strsect->sh_name); - - /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_layout.size = ALIGN(mod->init_layout.size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_layout.size; - mod->init_layout.size += sizeof(struct mod_kallsyms); - info->init_typeoffs = mod->init_layout.size; - mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = debug_align(mod->init_layout.size); -} - -/* - * We use the full symtab and strtab which layout_symtab arranged to - * be appended to the init section. Later we switch to the cut-down - * core-only ones. - */ -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ - unsigned int i, ndst; - const Elf_Sym *src; - Elf_Sym *dst; - char *s; - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - - /* Set up to point into init section. */ - mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; - - mod->kallsyms->symtab = (void *)symsec->sh_addr; - mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); - /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - - /* - * Now populate the cut down core kallsyms for after init - * and set types up while we still have access to sections. - */ - mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; - src = mod->kallsyms->symtab; - for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - mod->kallsyms->typetab[i] = elf_type(src + i, info); - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - mod->core_kallsyms.typetab[ndst] = - mod->kallsyms->typetab[i]; - dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], - KSYM_NAME_LEN) + 1; - } - } - mod->core_kallsyms.num_symtab = ndst; -} -#else -static inline void layout_symtab(struct module *mod, struct load_info *info) -{ -} - -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ -} -#endif /* CONFIG_KALLSYMS */ - -#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) -static void init_build_id(struct module *mod, const struct load_info *info) -{ - const Elf_Shdr *sechdr; - unsigned int i; - - for (i = 0; i < info->hdr->e_shnum; i++) { - sechdr = &info->sechdrs[i]; - if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && - !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, - sechdr->sh_size)) - break; - } -} -#else -static void init_build_id(struct module *mod, const struct load_info *info) -{ -} -#endif - static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) { if (!debug) @@ -2849,97 +1623,6 @@ bool __weak module_exit_section(const char *name) return strstarts(name, ".exit"); } -#ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); - } -} -#else -static inline void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ -} -#endif - -#ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, int flags) -{ - int err = -ENODATA; - const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - const char *reason; - const void *mod = info->hdr; - bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | - MODULE_INIT_IGNORE_VERMAGIC); - /* - * Do not allow mangled modules as a module with version information - * removed is no longer the module that was signed. - */ - if (!mangled_module && - info->len > markerlen && - memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { - /* We truncate the module to discard the signature */ - info->len -= markerlen; - err = mod_verify_sig(mod, info); - if (!err) { - info->sig_ok = true; - return 0; - } - } - - /* - * We don't permit modules to be loaded into the trusted kernels - * without a valid signature on them, but if we're not enforcing, - * certain errors are non-fatal. - */ - switch (err) { - case -ENODATA: - reason = "unsigned module"; - break; - case -ENOPKG: - reason = "module with unsupported crypto"; - break; - case -ENOKEY: - reason = "module with unavailable key"; - break; - - default: - /* - * All other errors are fatal, including lack of memory, - * unparseable signatures, and signature check failures -- - * even if signatures aren't required. - */ - return err; - } - - if (is_module_sig_enforced()) { - pr_notice("Loading of %s is rejected\n", reason); - return -EKEYREJECTED; - } - - return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); -} -#else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, int flags) -{ - return 0; -} -#endif /* !CONFIG_MODULE_SIG */ - static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) { #if defined(CONFIG_64BIT) @@ -3033,6 +1716,10 @@ static int elf_validity_check(struct load_info *info) * strings in the section safe. */ info->secstrings = (void *)info->hdr + strhdr->sh_offset; + if (strhdr->sh_size == 0) { + pr_err("empty section name table\n"); + goto no_exec; + } if (info->secstrings[strhdr->sh_size - 1] != '\0') { pr_err("ELF Spec violation: section name table isn't null terminated\n"); goto no_exec; @@ -3107,30 +1794,23 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l return 0; } -#ifdef CONFIG_LIVEPATCH static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { - if (get_modinfo(info, "livepatch")) { - mod->klp = true; + if (!get_modinfo(info, "livepatch")) + /* Nothing more to do */ + return 0; + + if (set_livepatch_module(mod)) { add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", - mod->name); - } - - return 0; -} -#else /* !CONFIG_LIVEPATCH */ -static int check_modinfo_livepatch(struct module *mod, struct load_info *info) -{ - if (get_modinfo(info, "livepatch")) { - pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", - mod->name); - return -ENOEXEC; + mod->name); + return 0; } - return 0; + pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", + mod->name); + return -ENOEXEC; } -#endif /* CONFIG_LIVEPATCH */ static void check_modinfo_retpoline(struct module *mod, struct load_info *info) { @@ -3456,6 +2136,24 @@ static int move_module(struct module *mod, struct load_info *info) } else mod->init_layout.base = NULL; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + /* Do the allocs. */ + ptr = vmalloc(mod->data_layout.size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. + */ + kmemleak_not_leak(ptr); + if (!ptr) { + module_memfree(mod->core_layout.base); + module_memfree(mod->init_layout.base); + return -ENOMEM; + } + + memset(ptr, 0, mod->data_layout.size); + mod->data_layout.base = ptr; +#endif /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { @@ -3468,6 +2166,8 @@ static int move_module(struct module *mod, struct load_info *info) if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); + else if (!(shdr->sh_flags & SHF_EXECINSTR)) + dest = mod->data_layout.base + shdr->sh_entsize; else dest = mod->core_layout.base + shdr->sh_entsize; @@ -3629,6 +2329,9 @@ static void module_deallocate(struct module *mod, struct load_info *info) module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); module_memfree(mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + vfree(mod->data_layout.base); +#endif } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3879,6 +2582,9 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + if (module_check_misalignment(mod)) + goto out_misaligned; + module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); @@ -3892,6 +2598,8 @@ static int complete_formation(struct module *mod, struct load_info *info) return 0; +out_misaligned: + err = -EINVAL; out: mutex_unlock(&module_mutex); return err; @@ -4158,7 +2866,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); module_deallocate(mod, info); free_copy: @@ -4227,287 +2935,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size) return ((void *)addr >= start && (void *)addr < start + size); } -#ifdef CONFIG_KALLSYMS -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - if (str[0] == '.' && str[1] == 'L') - return true; - return str[0] == '$' && strchr("axtd", str[1]) - && (str[2] == '\0' || str[2] == '.'); -} - -static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) -{ - return kallsyms->strtab + kallsyms->symtab[symnum].st_name; -} - -/* - * Given a module and address, find the corresponding symbol and return its name - * while providing its size and offset if needed. - */ -static const char *find_kallsyms_symbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) -{ - unsigned int i, best = 0; - unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - /* At worse, next value is at end of module */ - if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; - else - nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; - - bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); - - /* - * Scan for closest preceding symbol, and next symbol. (ELF - * starts real symbols at 1). - */ - for (i = 1; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - unsigned long thisval = kallsyms_symbol_value(sym); - - if (sym->st_shndx == SHN_UNDEF) - continue; - - /* - * We ignore unnamed symbols: they're uninformative - * and inserted at a whim. - */ - if (*kallsyms_symbol_name(kallsyms, i) == '\0' - || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) - continue; - - if (thisval <= addr && thisval > bestval) { - best = i; - bestval = thisval; - } - if (thisval > addr && thisval < nextval) - nextval = thisval; - } - - if (!best) - return NULL; - - if (size) - *size = nextval - bestval; - if (offset) - *offset = addr - bestval; - - return kallsyms_symbol_name(kallsyms, best); -} - -void * __weak dereference_module_function_descriptor(struct module *mod, - void *ptr) -{ - return ptr; -} - -/* - * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. - */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - const char *ret = NULL; - struct module *mod; - - preempt_disable(); - mod = __module_address(addr); - if (mod) { - if (modname) - *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } - - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); - ret = namebuf; - } - preempt_enable(); - - return ret; -} - -int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, NULL, NULL); - if (!sym) - goto out; - - strlcpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strlcpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - struct mod_kallsyms *kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); - if (symnum < kallsyms->num_symtab) { - const Elf_Sym *sym = &kallsyms->symtab[symnum]; - - *value = kallsyms_symbol_value(sym); - *type = kallsyms->typetab[symnum]; - strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); - strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, *value, mod); - preempt_enable(); - return 0; - } - symnum -= kallsyms->num_symtab; - } - preempt_enable(); - return -ERANGE; -} - -/* Given a module and name of symbol, find and return the symbol's value */ -static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) -{ - unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && - sym->st_shndx != SHN_UNDEF) - return kallsyms_symbol_value(sym); - } - return 0; -} - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) -{ - struct module *mod; - char *colon; - unsigned long ret = 0; - - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); - if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { - if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = find_kallsyms_symbol_value(mod, colon+1); - } else { - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) - break; - } - } - preempt_enable(); - return ret; -} - -#ifdef CONFIG_LIVEPATCH -int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, - struct module *, unsigned long), - void *data) -{ - struct module *mod; - unsigned int i; - int ret = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) { - /* We hold module_mutex: no need for rcu_dereference_sched */ - struct mod_kallsyms *kallsyms = mod->kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (sym->st_shndx == SHN_UNDEF) - continue; - - ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms_symbol_value(sym)); - if (ret != 0) - goto out; - - cond_resched(); - } - } -out: - mutex_unlock(&module_mutex); - return ret; -} -#endif /* CONFIG_LIVEPATCH */ -#endif /* CONFIG_KALLSYMS */ - static void cfi_init(struct module *mod) { #ifdef CONFIG_CFI_CLANG @@ -4531,22 +2958,19 @@ static void cfi_init(struct module *mod) mod->exit = *exit; #endif - cfi_module_add(mod, module_addr_min); + cfi_module_add(mod, mod_tree.addr_min); #endif } static void cfi_cleanup(struct module *mod) { #ifdef CONFIG_CFI_CLANG - cfi_module_remove(mod, module_addr_min); + cfi_module_remove(mod, mod_tree.addr_min); #endif } -/* Maximum number of characters written by module_flags() */ -#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) - /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ -static char *module_flags(struct module *mod, char *buf) +char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -4555,7 +2979,7 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - bx += module_flags_taint(mod, buf + bx); + bx += module_flags_taint(mod->taints, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; @@ -4569,103 +2993,6 @@ static char *module_flags(struct module *mod, char *buf) return buf; } -#ifdef CONFIG_PROC_FS -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - -static int m_show(struct seq_file *m, void *p) -{ - struct module *mod = list_entry(p, struct module, list); - char buf[MODULE_FLAGS_BUF_SIZE]; - void *value; - - /* We always ignore unformed modules. */ - if (mod->state == MODULE_STATE_UNFORMED) - return 0; - - seq_printf(m, "%s %u", - mod->name, mod->init_layout.size + mod->core_layout.size); - print_unload_info(m, mod); - - /* Informative for users. */ - seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading" : - mod->state == MODULE_STATE_COMING ? "Loading" : - "Live"); - /* Used by oprofile and other similar tools. */ - value = m->private ? NULL : mod->core_layout.base; - seq_printf(m, " 0x%px", value); - - /* Taints info */ - if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); - - seq_puts(m, "\n"); - return 0; -} - -/* - * Format: modulename size refcount deps address - * - * Where refcount is a number or -, and deps is a comma-separated list - * of depends or -. - */ -static const struct seq_operations modules_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = m_show -}; - -/* - * This also sets the "private" pointer to non-NULL if the - * kernel pointers should be hidden (so you can just test - * "m->private" to see if you should keep the values private). - * - * We use the same logic as for /proc/kallsyms. - */ -static int modules_open(struct inode *inode, struct file *file) -{ - int err = seq_open(file, &modules_op); - - if (!err) { - struct seq_file *m = file->private_data; - m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; - } - - return err; -} - -static const struct proc_ops modules_proc_ops = { - .proc_flags = PROC_ENTRY_PERMANENT, - .proc_open = modules_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - -static int __init proc_modules_init(void) -{ - proc_create("modules", 0, NULL, &modules_proc_ops); - return 0; -} -module_init(proc_modules_init); -#endif - /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { @@ -4721,13 +3048,20 @@ bool is_module_address(unsigned long addr) struct module *__module_address(unsigned long addr) { struct module *mod; + struct mod_tree_root *tree; - if (addr < module_addr_min || addr > module_addr_max) + if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max) + tree = &mod_tree; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max) + tree = &mod_data_tree; +#endif + else return NULL; module_assert_mutex_or_preempt(); - mod = mod_find(addr); + mod = mod_find(addr, tree); if (mod) { BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) @@ -4788,23 +3122,10 @@ void print_modules(void) continue; pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } + + print_unloaded_tainted_modules(); preempt_enable(); if (last_unloaded_module[0]) pr_cont(" [last unloaded: %s]", last_unloaded_module); pr_cont("\n"); } - -#ifdef CONFIG_MODVERSIONS -/* - * Generate the signature for all relevant module structures here. - * If these change, we don't want to try to parse the module. - */ -void module_layout(struct module *mod, - struct modversion_info *ver, - struct kernel_param *kp, - struct kernel_symbol *ks, - struct tracepoint * const *tp) -{ -} -EXPORT_SYMBOL(module_layout); -#endif diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c new file mode 100644 index 000000000000..9a8f4f0f6329 --- /dev/null +++ b/kernel/module/procfs.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module proc support + * + * Copyright (C) 2008 Alexey Dobriyan + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include "internal.h" + +#ifdef CONFIG_MODULE_UNLOAD +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + struct module_use *use; + int printed_something = 0; + + seq_printf(m, " %i ", module_refcount(mod)); + + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ + list_for_each_entry(use, &mod->source_list, source_list) { + printed_something = 1; + seq_printf(m, "%s,", use->source->name); + } + + if (mod->init && !mod->exit) { + printed_something = 1; + seq_puts(m, "[permanent],"); + } + + if (!printed_something) + seq_puts(m, "-"); +} +#else /* !CONFIG_MODULE_UNLOAD */ +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + /* We don't know the usage count, or what modules are using. */ + seq_puts(m, " - -"); +} +#endif /* CONFIG_MODULE_UNLOAD */ + +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + +static int m_show(struct seq_file *m, void *p) +{ + struct module *mod = list_entry(p, struct module, list); + char buf[MODULE_FLAGS_BUF_SIZE]; + void *value; + unsigned int size; + + /* We always ignore unformed modules. */ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + + size = mod->init_layout.size + mod->core_layout.size; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + size += mod->data_layout.size; +#endif + seq_printf(m, "%s %u", mod->name, size); + print_unload_info(m, mod); + + /* Informative for users. */ + seq_printf(m, " %s", + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : + "Live"); + /* Used by oprofile and other similar tools. */ + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); + + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", module_flags(mod, buf)); + + seq_puts(m, "\n"); + return 0; +} + +/* + * Format: modulename size refcount deps address + * + * Where refcount is a number or -, and deps is a comma-separated list + * of depends or -. + */ +static const struct seq_operations modules_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show +}; + +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ +static int modules_open(struct inode *inode, struct file *file) +{ + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; + } + + return err; +} + +static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = modules_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &modules_proc_ops); + return 0; +} +module_init(proc_modules_init); diff --git a/kernel/module/signing.c b/kernel/module/signing.c new file mode 100644 index 000000000000..a2ff4242e623 --- /dev/null +++ b/kernel/module/signing.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/module_signature.h> +#include <linux/string.h> +#include <linux/verification.h> +#include <linux/security.h> +#include <crypto/public_key.h> +#include <uapi/linux/module.h> +#include "internal.h" + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." + +static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); +module_param(sig_enforce, bool_enable_only, 0644); + +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, struct load_info *info) +{ + struct module_signature ms; + size_t sig_len, modlen = info->len; + int ret; + + pr_devel("==>%s(,%zu)\n", __func__, modlen); + + if (modlen <= sizeof(ms)) + return -EBADMSG; + + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + + ret = mod_check_sig(&ms, modlen, "module"); + if (ret) + return ret; + + sig_len = be32_to_cpu(ms.sig_len); + modlen -= sig_len + sizeof(ms); + info->len = modlen; + + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); +} + +int module_sig_check(struct load_info *info, int flags) +{ + int err = -ENODATA; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; + const void *mod = info->hdr; + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); + /* + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. + */ + if (!mangled_module && + info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + info->len -= markerlen; + err = mod_verify_sig(mod, info); + if (!err) { + info->sig_ok = true; + return 0; + } + } + + /* + * We don't permit modules to be loaded into the trusted kernels + * without a valid signature on them, but if we're not enforcing, + * certain errors are non-fatal. + */ + switch (err) { + case -ENODATA: + reason = "unsigned module"; + break; + case -ENOPKG: + reason = "module with unsupported crypto"; + break; + case -ENOKEY: + reason = "module with unavailable key"; + break; + + default: + /* + * All other errors are fatal, including lack of memory, + * unparseable signatures, and signature check failures -- + * even if signatures aren't required. + */ + return err; + } + + if (is_module_sig_enforced()) { + pr_notice("Loading of %s is rejected\n", reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); +} diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c new file mode 100644 index 000000000000..14fbea66f12f --- /dev/null +++ b/kernel/module/strict_rwx.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module strict rwx + * + * Copyright (C) 2015 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/set_memory.h> +#include "internal.h" + +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| + * + * These values are always page-aligned (as is base) when + * CONFIG_STRICT_MODULE_RWX is set. + */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of + * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we + * are strict. + */ +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base, + PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT); +} + +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); +} + +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); +} + +static bool layout_check_misalignment(const struct module_layout *layout) +{ + return WARN_ON(!PAGE_ALIGNED(layout->base)) || + WARN_ON(!PAGE_ALIGNED(layout->text_size)) || + WARN_ON(!PAGE_ALIGNED(layout->ro_size)) || + WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) || + WARN_ON(!PAGE_ALIGNED(layout->size)); +} + +bool module_check_misalignment(const struct module *mod) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return false; + + return layout_check_misalignment(&mod->core_layout) || + layout_check_misalignment(&mod->data_layout) || + layout_check_misalignment(&mod->init_layout); +} + +void module_enable_x(const struct module *mod) +{ + if (!PAGE_ALIGNED(mod->core_layout.base) || + !PAGE_ALIGNED(mod->init_layout.base)) + return; + + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} + +void module_enable_ro(const struct module *mod, bool after_init) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return; +#ifdef CONFIG_STRICT_MODULE_RWX + if (!rodata_enabled) + return; +#endif + + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); + frob_text(&mod->core_layout, set_memory_ro); + + frob_rodata(&mod->data_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->data_layout, set_memory_ro); +} + +void module_enable_nx(const struct module *mod) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return; + + frob_rodata(&mod->data_layout, set_memory_nx); + frob_ro_after_init(&mod->data_layout, set_memory_nx); + frob_writable_data(&mod->data_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); +} + +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; + int i; + + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return 0; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); + return -ENOEXEC; + } + } + + return 0; +} diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c new file mode 100644 index 000000000000..ce68f821dcd1 --- /dev/null +++ b/kernel/module/sysfs.c @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module sysfs support + * + * Copyright (C) 2008 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include "internal.h" + +/* + * /sys/module/foo/sections stuff + * J. Corbet <corbet@lwn.net> + */ +#ifdef CONFIG_KALLSYMS +struct module_sect_attr { + struct bin_attribute battr; + unsigned long address; +}; + +struct module_sect_attrs { + struct attribute_group grp; + unsigned int nsections; + struct module_sect_attr attrs[]; +}; + +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) +{ + struct module_sect_attr *sattr = + container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; + + if (pos != 0) + return -EINVAL; + + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; +} + +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + unsigned int section; + + for (section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].battr.attr.name); + kfree(sect_attrs); +} + +static void add_sect_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int nloaded = 0, i, size[2]; + struct module_sect_attrs *sect_attrs; + struct module_sect_attr *sattr; + struct bin_attribute **gattr; + + /* Count loaded sections and allocate structures */ + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) + nloaded++; + size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (!sect_attrs) + return; + + /* Setup section attributes. */ + sect_attrs->grp.name = "sections"; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; + + sect_attrs->nsections = 0; + sattr = §_attrs->attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + + if (sect_empty(sec)) + continue; + sysfs_bin_attr_init(&sattr->battr); + sattr->address = sec->sh_addr; + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (!sattr->battr.attr.name) + goto out; + sect_attrs->nsections++; + sattr->battr.read = module_sect_read; + sattr->battr.size = MODULE_SECT_READ_SIZE; + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; + } + *gattr = NULL; + + if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + goto out; + + mod->sect_attrs = sect_attrs; + return; +out: + free_sect_attrs(sect_attrs); +} + +static void remove_sect_attrs(struct module *mod) +{ + if (mod->sect_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->sect_attrs->grp); + /* + * We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. + */ + free_sect_attrs(mod->sect_attrs); + mod->sect_attrs = NULL; + } +} + +/* + * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. + */ + +struct module_notes_attrs { + struct kobject *dir; + unsigned int notes; + struct bin_attribute attrs[]; +}; + +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + /* + * The caller checked the pos and count against our size. + */ + memcpy(buf, bin_attr->private + pos, count); + return count; +} + +static void free_notes_attrs(struct module_notes_attrs *notes_attrs, + unsigned int i) +{ + if (notes_attrs->dir) { + while (i-- > 0) + sysfs_remove_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i]); + kobject_put(notes_attrs->dir); + } + kfree(notes_attrs); +} + +static void add_notes_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int notes, loaded, i; + struct module_notes_attrs *notes_attrs; + struct bin_attribute *nattr; + + /* failed to create section attributes, so can't create notes */ + if (!mod->sect_attrs) + return; + + /* Count notes sections and allocate structures. */ + notes = 0; + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + info->sechdrs[i].sh_type == SHT_NOTE) + ++notes; + + if (notes == 0) + return; + + notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), + GFP_KERNEL); + if (!notes_attrs) + return; + + notes_attrs->notes = notes; + nattr = ¬es_attrs->attrs[0]; + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) + continue; + if (info->sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; + nattr->attr.mode = 0444; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *)info->sechdrs[i].sh_addr; + nattr->read = module_notes_read; + ++nattr; + } + ++loaded; + } + + notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); + if (!notes_attrs->dir) + goto out; + + for (i = 0; i < notes; ++i) + if (sysfs_create_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i])) + goto out; + + mod->notes_attrs = notes_attrs; + return; + +out: + free_notes_attrs(notes_attrs, i); +} + +static void remove_notes_attrs(struct module *mod) +{ + if (mod->notes_attrs) + free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); +} + +#else /* !CONFIG_KALLSYMS */ +static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { } +static inline void remove_sect_attrs(struct module *mod) { } +static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { } +static inline void remove_notes_attrs(struct module *mod) { } +#endif /* CONFIG_KALLSYMS */ + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int add_usage_links(struct module *mod) +{ + int ret = 0; +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } + mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); +#endif + return ret; +} + +static void module_remove_modinfo_attrs(struct module *mod, int end) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + if (end >= 0 && i > end) + break; + /* pick a field to test for end of list */ + if (!attr->attr.name) + break; + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); + if (attr->free) + attr->free(mod); + } + kfree(mod->modinfo_attrs); +} + +static int module_add_modinfo_attrs(struct module *mod) +{ + struct module_attribute *attr; + struct module_attribute *temp_attr; + int error = 0; + int i; + + mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * + (modinfo_attrs_count + 1)), + GFP_KERNEL); + if (!mod->modinfo_attrs) + return -ENOMEM; + + temp_attr = mod->modinfo_attrs; + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (!attr->test || attr->test(mod)) { + memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); + if (error) + goto error_out; + ++temp_attr; + } + } + + return 0; + +error_out: + if (i > 0) + module_remove_modinfo_attrs(mod, --i); + else + kfree(mod->modinfo_attrs); + return error; +} + +static void mod_kobject_put(struct module *mod) +{ + DECLARE_COMPLETION_ONSTACK(c); + + mod->mkobj.kobj_completion = &c; + kobject_put(&mod->mkobj.kobj); + wait_for_completion(&c); +} + +static int mod_sysfs_init(struct module *mod) +{ + int err; + struct kobject *kobj; + + if (!module_sysfs_initialized) { + pr_err("%s: module sysfs not initialized\n", mod->name); + err = -EINVAL; + goto out; + } + + kobj = kset_find_obj(module_kset, mod->name); + if (kobj) { + pr_err("%s: module is already loaded\n", mod->name); + kobject_put(kobj); + err = -EINVAL; + goto out; + } + + mod->mkobj.mod = mod; + + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + mod->mkobj.kobj.kset = module_kset; + err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, + "%s", mod->name); + if (err) + mod_kobject_put(mod); + +out: + return err; +} + +int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + err = mod_sysfs_init(mod); + if (err) + goto out; + + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); + if (!mod->holders_dir) { + err = -ENOMEM; + goto out_unreg; + } + + err = module_param_sysfs_setup(mod, kparam, num_params); + if (err) + goto out_unreg_holders; + + err = module_add_modinfo_attrs(mod); + if (err) + goto out_unreg_param; + + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); + + return 0; + +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod, -1); +out_unreg_param: + module_param_sysfs_remove(mod); +out_unreg_holders: + kobject_put(mod->holders_dir); +out_unreg: + mod_kobject_put(mod); +out: + return err; +} + +static void mod_sysfs_fini(struct module *mod) +{ + remove_notes_attrs(mod); + remove_sect_attrs(mod); + mod_kobject_put(mod); +} + +void mod_sysfs_teardown(struct module *mod) +{ + del_usage_links(mod); + module_remove_modinfo_attrs(mod, -1); + module_param_sysfs_remove(mod); + kobject_put(mod->mkobj.drivers_dir); + kobject_put(mod->holders_dir); + mod_sysfs_fini(mod); +} + +void init_param_lock(struct module *mod) +{ + mutex_init(&mod->param_lock); +} diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c new file mode 100644 index 000000000000..7f8133044d09 --- /dev/null +++ b/kernel/module/tracking.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module taint unload tracking support + * + * Copyright (C) 2022 Aaron Tomlin + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include "internal.h" + +static LIST_HEAD(unloaded_tainted_modules); + +int try_add_tainted_module(struct module *mod) +{ + struct mod_unload_taint *mod_taint; + + module_assert_mutex_or_preempt(); + + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list, + lockdep_is_held(&module_mutex)) { + if (!strcmp(mod_taint->name, mod->name) && + mod_taint->taints & mod->taints) { + mod_taint->count++; + goto out; + } + } + + mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL); + if (unlikely(!mod_taint)) + return -ENOMEM; + strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN); + mod_taint->taints = mod->taints; + list_add_rcu(&mod_taint->list, &unloaded_tainted_modules); + mod_taint->count = 1; +out: + return 0; +} + +void print_unloaded_tainted_modules(void) +{ + struct mod_unload_taint *mod_taint; + char buf[MODULE_FLAGS_BUF_SIZE]; + + if (!list_empty(&unloaded_tainted_modules)) { + printk(KERN_DEFAULT "Unloaded tainted modules:"); + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, + list) { + size_t l; + + l = module_flags_taint(mod_taint->taints, buf); + buf[l++] = '\0'; + pr_cont(" %s(%s):%llu", mod_taint->name, buf, + mod_taint->count); + } + } +} diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c new file mode 100644 index 000000000000..8ec5cfd60496 --- /dev/null +++ b/kernel/module/tree_lookup.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Modules tree lookup + * + * Copyright (C) 2015 Peter Zijlstra + * Copyright (C) 2015 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/rbtree_latch.h> +#include "internal.h" + +/* + * Use a latched RB-tree for __module_address(); this allows us to use + * RCU-sched lookups of the address from any context. + * + * This is conditional on PERF_EVENTS || TRACING because those can really hit + * __module_address() hard by doing a lot of stack unwinding; potentially from + * NMI context. + */ + +static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->base; +} + +static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->size; +} + +static __always_inline bool +mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) +{ + return __mod_tree_val(a) < __mod_tree_val(b); +} + +static __always_inline int +mod_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long start, end; + + start = __mod_tree_val(n); + if (val < start) + return -1; + + end = start + __mod_tree_size(n); + if (val >= end) + return 1; + + return 0; +} + +static const struct latch_tree_ops mod_tree_ops = { + .less = mod_tree_less, + .comp = mod_tree_comp, +}; + +static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree) +{ + latch_tree_insert(&node->node, &tree->root, &mod_tree_ops); +} + +static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree) +{ + latch_tree_erase(&node->node, &tree->root, &mod_tree_ops); +} + +/* + * These modifications: insert, remove_init and remove; are serialized by the + * module_mutex. + */ +void mod_tree_insert(struct module *mod) +{ + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; + + __mod_tree_insert(&mod->core_layout.mtn, &mod_tree); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn, &mod_tree); + +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + mod->data_layout.mtn.mod = mod; + __mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree); +#endif +} + +void mod_tree_remove_init(struct module *mod) +{ + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn, &mod_tree); +} + +void mod_tree_remove(struct module *mod) +{ + __mod_tree_remove(&mod->core_layout.mtn, &mod_tree); + mod_tree_remove_init(mod); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + __mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree); +#endif +} + +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) +{ + struct latch_tree_node *ltn; + + ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops); + if (!ltn) + return NULL; + + return container_of(ltn, struct mod_tree_node, node)->mod; +} diff --git a/kernel/module/version.c b/kernel/module/version.c new file mode 100644 index 000000000000..53f43ac5a73e --- /dev/null +++ b/kernel/module/version.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module version support + * + * Copyright (C) 2008 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/printk.h> +#include "internal.h" + +int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; + unsigned int i, num_versions; + struct modversion_info *versions; + + /* Exporting module didn't supply crcs? OK, we're already tainted. */ + if (!crc) + return 1; + + /* No versions at all? modprobe --force does this. */ + if (versindex == 0) + return try_to_force_load(mod, symname) == 0; + + versions = (void *)sechdrs[versindex].sh_addr; + num_versions = sechdrs[versindex].sh_size + / sizeof(struct modversion_info); + + for (i = 0; i < num_versions; i++) { + u32 crcval; + + if (strcmp(versions[i].name, symname) != 0) + continue; + + crcval = *crc; + if (versions[i].crc == crcval) + return 1; + pr_debug("Found checksum %X vs module %lX\n", + crcval, versions[i].crc); + goto bad_version; + } + + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); + return 1; + +bad_version: + pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname); + return 0; +} + +int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + struct find_symbol_arg fsa = { + .name = "module_layout", + .gplok = true, + }; + + /* + * Since this should be found in kernel (which can't be removed), no + * locking is necessary -- use preempt_disable() to placate lockdep. + */ + preempt_disable(); + if (!find_symbol(&fsa)) { + preempt_enable(); + BUG(); + } + preempt_enable(); + return check_version(info, "module_layout", mod, fsa.crc); +} + +/* First part is kernel version, which we ignore if module has crcs. */ +int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) +{ + if (has_crcs) { + amagic += strcspn(amagic, " "); + bmagic += strcspn(bmagic, " "); + } + return strcmp(amagic, bmagic) == 0; +} + +/* + * Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. + */ +void module_layout(struct module *mod, + struct modversion_info *ver, + struct kernel_param *kp, + struct kernel_symbol *ks, + struct tracepoint * const *tp) +{ +} +EXPORT_SYMBOL(module_layout); diff --git a/kernel/module_signing.c b/kernel/module_signing.c deleted file mode 100644 index 8723ae70ea1f..000000000000 --- a/kernel/module_signing.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Module signature checker - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/module_signature.h> -#include <linux/string.h> -#include <linux/verification.h> -#include <crypto/public_key.h> -#include "module-internal.h" - -/* - * Verify the signature on a module. - */ -int mod_verify_sig(const void *mod, struct load_info *info) -{ - struct module_signature ms; - size_t sig_len, modlen = info->len; - int ret; - - pr_devel("==>%s(,%zu)\n", __func__, modlen); - - if (modlen <= sizeof(ms)) - return -EBADMSG; - - memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - - ret = mod_check_sig(&ms, modlen, "module"); - if (ret) - return ret; - - sig_len = be32_to_cpu(ms.sig_len); - modlen -= sig_len + sizeof(ms); - info->len = modlen; - - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); -} diff --git a/kernel/notifier.c b/kernel/notifier.c index ba005ebf4730..0d5bd62c480e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -20,7 +20,8 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); */ static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) + struct notifier_block *n, + bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { @@ -30,6 +31,8 @@ static int notifier_chain_register(struct notifier_block **nl, } if (n->priority > (*nl)->priority) break; + if (n->priority == (*nl)->priority && unique_priority) + return -EBUSY; nl = &((*nl)->next); } n->next = *nl; @@ -144,13 +147,36 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, int ret; spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** + * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an atomic notifier chain if there is no other + * notifier registered using the same priority. + * + * Returns 0 on success, %-EEXIST or %-EBUSY on error. + */ +int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n, true); + spin_unlock_irqrestore(&nh->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); + +/** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain @@ -204,23 +230,27 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); +/** + * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty + * @nh: Pointer to head of the atomic notifier chain + * + * Checks whether notifier chain is empty. + * + * Returns true is notifier chain is empty, false otherwise. + */ +bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) +{ + return !rcu_access_pointer(nh->head); +} + /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ -/** - * blocking_notifier_chain_register - Add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain. - * Must be called in process context. - * - * Returns 0 on success, %-EEXIST on error. - */ -int blocking_notifier_chain_register(struct blocking_notifier_head *nh, - struct notifier_block *n) +static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n, + bool unique_priority) { int ret; @@ -230,16 +260,49 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } + +/** + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. + * + * Returns 0 on success, %-EEXIST on error. + */ +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + return __blocking_notifier_chain_register(nh, n, false); +} EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** + * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an blocking notifier chain if there is no other + * notifier registered using the same priority. + * + * Returns 0 on success, %-EEXIST or %-EBUSY on error. + */ +int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + return __blocking_notifier_chain_register(nh, n, true); +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); + +/** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain @@ -341,7 +404,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); @@ -420,10 +483,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } diff --git a/kernel/padata.c b/kernel/padata.c index 18d3a5c699d8..e5819bb8bd1d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -181,7 +181,7 @@ int padata_do_parallel(struct padata_shell *ps, goto out; if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) { - if (!cpumask_weight(pd->cpumask.cbcpu)) + if (cpumask_empty(pd->cpumask.cbcpu)) goto out; /* Select an alternate fallback CPU and notify the caller. */ diff --git a/kernel/panic.c b/kernel/panic.c index 55b50e052ec3..a3308af28a21 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -43,12 +43,14 @@ * Should we dump all CPUs backtraces in an oops event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = - IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; + IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -66,12 +68,35 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 +#define PANIC_PRINT_ALL_CPU_BT 0x00000040 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +static struct ctl_table kern_panic_table[] = { + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_panic_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_panic_table); + return 0; +} +late_initcall(kernel_panic_sysctls_init); +#endif + static long no_blink(int state) { return 0; @@ -147,10 +172,16 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(void) +static void panic_print_sys_info(bool console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (console_flush) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + return; + } + + if (panic_print & PANIC_PRINT_ALL_CPU_BT) + trigger_all_cpu_backtrace(); if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -185,6 +216,16 @@ void panic(const char *fmt, ...) int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + } + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -272,6 +313,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + panic_print_sys_info(false); + kmsg_dump(KMSG_DUMP_PANIC); /* @@ -302,7 +345,7 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(); + panic_print_sys_info(true); if (!panic_blink) panic_blink = no_blink; @@ -576,16 +619,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn) panic("panic_on_warn set ...\n"); - } if (!regs) dump_stack(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a46a3723bc66..f4f8cb0435b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); diff --git a/kernel/platform-feature.c b/kernel/platform-feature.c new file mode 100644 index 000000000000..cb6a6c3e4fed --- /dev/null +++ b/kernel/platform-feature.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitops.h> +#include <linux/cache.h> +#include <linux/export.h> +#include <linux/platform-feature.h> + +#define PLATFORM_FEAT_ARRAY_SZ BITS_TO_LONGS(PLATFORM_FEAT_N) +static unsigned long __read_mostly platform_features[PLATFORM_FEAT_ARRAY_SZ]; + +void platform_set(unsigned int feature) +{ + set_bit(feature, platform_features); +} +EXPORT_SYMBOL_GPL(platform_set); + +void platform_clear(unsigned int feature) +{ + clear_bit(feature, platform_features); +} +EXPORT_SYMBOL_GPL(platform_clear); + +bool platform_has(unsigned int feature) +{ + return test_bit(feature, platform_features); +} +EXPORT_SYMBOL_GPL(platform_has); diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8bef..874ad834dc8d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif KASAN_SANITIZE_snapshot.o := n diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0153b0ca7b23..6c373f2960e7 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static int em_debug_units_show(struct seq_file *s, void *unused) +static int em_debug_flags_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? - "milliWatts" : "bogoWatts"; - seq_printf(s, "%s\n", units); + seq_printf(s, "%#lx\n", pd->flags); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_units); - -static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) -{ - struct em_perf_domain *pd = s->private; - int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; - - seq_printf(s, "%d\n", enabled); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); static void em_debug_create_pd(struct device *dev) { @@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); - debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); - debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, - &em_debug_skip_inefficiencies_fops); + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) @@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {} #endif static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb) + int nr_states, struct em_data_callback *cb, + unsigned long flags) { unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; @@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); @@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res = em_scale_power(table[i].power); + unsigned long power_res, cost; + + if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + goto free_ps_table; + } + } else { + power_res = em_scale_power(table[i].power); + cost = div64_u64(fmax * power_res, table[i].frequency); + } + + table[i].cost = cost; - table[i].cost = div64_u64(fmax * power_res, - table[i].frequency); if (table[i].cost >= prev_cost) { table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", @@ -197,7 +196,8 @@ free_ps_table: } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + unsigned long flags) { struct em_perf_domain *pd; struct device *cpu_dev; @@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb); + ret = em_create_perf_table(dev, pd, nr_states, cb, flags); if (ret) { kfree(pd); return ret; @@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev) found++; } + cpufreq_cpu_put(policy); + if (!found) return; @@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts) { unsigned long cap, prev_cap = 0; + unsigned long flags = 0; int cpu, ret; if (!dev || !nr_states || !cb) @@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - ret = em_create_pd(dev, nr_states, cb, cpus); + if (milliwatts) + flags |= EM_PERF_DOMAIN_MILLIWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; - if (milliwatts) - dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + dev->em_pd->flags |= flags; em_cpufreq_update_efficiencies(dev); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..89c71fce225d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,7 +28,6 @@ #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/ctype.h> -#include <linux/genhd.h> #include <linux/ktime.h> #include <linux/security.h> #include <linux/secretmem.h> @@ -84,7 +83,7 @@ bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION) && - !secretmem_active(); + !secretmem_active() && !cxl_mem_active(); } /** @@ -666,7 +665,7 @@ static void power_down(void) hibernation_platform_enter(); fallthrough; case HIBERNATION_SHUTDOWN: - if (pm_power_off) + if (kernel_can_power_off()) kernel_power_off(); break; } @@ -689,8 +688,10 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; + } error = swsusp_read(&flags); swsusp_close(FMODE_READ | FMODE_EXCL); @@ -1328,7 +1329,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..e3694034b753 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -127,7 +127,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *s = buf; suspend_state_t i; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { + if (i >= PM_SUSPEND_MEM && cxl_mem_active()) + continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; @@ -136,6 +138,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, else s += sprintf(s, "%s ", label); } + } /* Convert the last space to a newline if needed. */ if (s != buf) @@ -545,35 +548,6 @@ static int __init pm_debug_messages_setup(char *str) } __setup("pm_debug_messages", pm_debug_messages_setup); -/** - * __pm_pr_dbg - Print a suspend debug message to the kernel log. - * @defer: Whether or not to use printk_deferred() to print the message. - * @fmt: Message format. - * - * The message will be emitted if enabled through the pm_debug_messages - * sysfs attribute. - */ -void __pm_pr_dbg(bool defer, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (!pm_debug_messages_on) - return; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - if (defer) - printk_deferred(KERN_DEBUG "PM: %pV", &vaf); - else - printk(KERN_DEBUG "PM: %pV", &vaf); - - va_end(args); -} - #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 11b570fcf049..3068601e585a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -6,9 +6,6 @@ * Originally from swsusp. */ - -#undef DEBUG - #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 330d49937692..2a406753af90 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -/** +/* * Data types related to memory bitmaps. * * Memory bitmap is a structure consisting of many linked lists of @@ -427,6 +427,10 @@ struct memory_bitmap { /** * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. * * This function is used to allocate inner nodes as well as the * leave nodes of the radix tree. It also adds the node to the @@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. * @bm: Memory bitmap. * * Starting from the last returned position this function searches for the next @@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * alloc_highmem_pages - Allocate some highmem pages for the image. * * Try to allocate as many pages as needed, but if the number of free highmem * pages is less than that, allocate them all. @@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info) } /** - * load header - Check the image header and copy the data from it. + * load_header - Check the image header and copy the data from it. */ static int load_header(struct swsusp_info *info) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6fcdee7e87a5..827075944d28 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -236,7 +236,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE || valid_state(state); + return state == PM_SUSPEND_TO_IDLE || + (valid_state(state) && !cxl_mem_active()); } static int platform_suspend_prepare(suspend_state_t state) diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index d20526c5be15..b663a97f5867 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value) value++; suspend_type = strsep(&value, ","); if (!suspend_type) - return 0; + return 1; repeat = strsep(&value, ","); if (repeat) { if (kstrtou32(repeat, 0, &test_repeat_count_max)) - return 0; + return 1; } for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; - return 0; + return 1; } printk(warn_bad_state, suspend_type); - return 0; + return 1; } __setup("test_suspend", setup_test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..91fffdd2c7fb 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -16,7 +16,6 @@ #include <linux/file.h> #include <linux/delay.h> #include <linux/bitops.h> -#include <linux/genhd.h> #include <linux/device.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -89,7 +88,7 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** +/* * The swap_map_handle structure is used for handling swap in * a file-alike way */ @@ -117,7 +116,7 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** +/* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ @@ -171,7 +170,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** +/* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ @@ -190,7 +189,7 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** +/* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. @@ -277,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); + bio = bio_alloc(hib_resume_bdev, 1, op | op_flags, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio_set_dev(bio, hib_resume_bdev); - bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { pr_err("Adding page to bio failed at %llu\n", diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 82abfaf3c2aa..b49c6ff6dca0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -93,6 +93,12 @@ EXPORT_SYMBOL_GPL(console_drivers); */ int __read_mostly suppress_printk; +/* + * During panic, heavy printk by other CPUs can delay the + * panic and risk deadlock on console resources. + */ +static int __read_mostly suppress_panic_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -146,8 +152,10 @@ static int __control_devkmsg(char *str) static int __init control_devkmsg(char *str) { - if (__control_devkmsg(str) < 0) + if (__control_devkmsg(str) < 0) { + pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; + } /* * Set sysctl string accordingly: @@ -166,7 +174,7 @@ static int __init control_devkmsg(char *str) */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; - return 0; + return 1; } __setup("printk.devkmsg=", control_devkmsg); @@ -257,6 +265,11 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) +static bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -268,11 +281,6 @@ static void __up_console_sem(unsigned long ip) static int console_locked, console_suspended; /* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - -/* * Array of consoles built from command line options (console=) */ @@ -361,12 +369,6 @@ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - struct latched_seq { seqcount_latch_t latch; u64 val[2]; @@ -392,6 +394,9 @@ static struct latched_seq clear_seq = { /* the maximum size of a formatted record (i.e. with prefix added per line) */ #define CONSOLE_LOG_MAX 1024 +/* the maximum size for a dropped text message */ +#define DROPPED_TEXT_MAX 64 + /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @@ -733,8 +738,19 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, goto out; } + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, atomic64_read(&user->seq), r)); + prb_read_valid(prb, + atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } @@ -1500,7 +1516,18 @@ static int syslog_print(char __user *buf, int size) seq = syslog_seq; mutex_unlock(&syslog_lock); - len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ + len = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) @@ -1843,6 +1870,16 @@ static int console_trylock_spinning(void) if (console_trylock()) return 1; + /* + * It's unsafe to spin once a panic has begun. If we are the + * panic CPU, we may have already halted the owner of the + * console_sem. If we are not the panic CPU, then we should + * avoid taking console_sem, so the panic CPU has a better + * chance of cleanly acquiring it later. + */ + if (panic_in_progress()) + return 0; + printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); @@ -1888,47 +1925,24 @@ static int console_trylock_spinning(void) } /* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. + * Call the specified console driver, asking it to write out the specified + * text and length. If @dropped_text is non-NULL and any records have been + * dropped, a dropped message will be written out first. */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; - - trace_console_rcuidle(text, len); + size_t dropped_len; - if (!console_drivers) - return; - - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), + if (con->dropped && dropped_text) { + dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; + con->dropped); + con->dropped = 0; + con->write(con, dropped_text, dropped_len); } - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } - } + con->write(con, text, len); } /* @@ -2003,8 +2017,10 @@ static u8 *__printk_recursion_counter(void) int printk_delay_msec __read_mostly; -static inline void printk_delay(void) +static inline void printk_delay(int level) { + boot_delay_msec(level); + if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; @@ -2018,7 +2034,7 @@ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + raw_smp_processor_id(); + 0x80000000 + smp_processor_id(); } /** @@ -2092,6 +2108,8 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } + trace_console_rcuidle(text, text_len); + return text_len; } @@ -2100,7 +2118,6 @@ int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { - const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; @@ -2110,10 +2127,14 @@ int vprintk_store(int facility, int level, u8 *recursion_ptr; u16 reserve_size; va_list args2; + u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is @@ -2122,8 +2143,7 @@ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); - if (!printk_enter_irqsave(recursion_ptr, irqflags)) - return 0; + caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be @@ -2218,28 +2238,34 @@ asmlinkage int vprintk_emit(int facility, int level, if (unlikely(suppress_printk)) return 0; + if (unlikely(suppress_panic_printk) && + atomic_read(&panic_cpu) != raw_smp_processor_id()) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; } - boot_delay_msec(level); - printk_delay(); + printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console + * The caller may be holding system-critical or + * timing-sensitive locks. Disable preemption during + * printing of all remaining records to all consoles so that + * this context can return as soon as possible. Hopefully + * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. + * semaphore. The release will print out buffers. With the + * spinning variant, this context tries to take over the + * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); @@ -2270,18 +2296,19 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); + #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 +#define DROPPED_TEXT_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 +#define prb_next_seq(rb) 0 static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; static size_t record_print_text(const struct printk_record *r, bool syslog, bool time) @@ -2298,9 +2325,12 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) +{ +} static bool suppress_message_printing(int level) { return false; } +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -2324,6 +2354,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...) } #endif +static void set_user_specified(struct console_cmdline *c, bool user_specified) +{ + if (!user_specified) + return; + + /* + * @c console was defined by the user on the command line. + * Do not clear when added twice also by SPCR or the device tree. + */ + c->user_specified = true; + /* At least one console defined by the user on the command line. */ + console_set_on_cmdline = 1; +} + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options, bool user_specified) { @@ -2340,8 +2384,7 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; - if (user_specified) - c->user_specified = true; + set_user_specified(c, user_specified); return 0; } } @@ -2351,7 +2394,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; - c->user_specified = user_specified; + set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; @@ -2417,7 +2460,6 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options, true); - console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); @@ -2476,6 +2518,7 @@ void suspend_console(void) if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_flush(1000, true); console_lock(); console_suspended = 1; up_console_sem(); @@ -2488,6 +2531,7 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + pr_flush(1000, true); } /** @@ -2558,31 +2602,220 @@ int is_console_locked(void) EXPORT_SYMBOL(is_console_locked); /* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. + * Return true when this CPU should unlock console_sem without pushing all + * messages to the console. This reduces the chance that the console is + * locked when the panic CPU tries to use it. */ -static int have_callable_console(void) +static bool abandon_console_lock_in_panic(void) { - struct console *con; + if (!panic_in_progress()) + return false; + + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return atomic_read(&panic_cpu) != raw_smp_processor_id(); +} + +/* + * Check if the given console is currently capable and allowed to print + * records. + * + * Requires the console_lock. + */ +static inline bool console_is_usable(struct console *con) +{ + if (!(con->flags & CON_ENABLED)) + return false; - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; + if (!con->write) + return false; - return 0; + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + return false; + + return true; +} + +static void __console_unlock(void) +{ + console_locked = 0; + up_console_sem(); } /* - * Can we actually use the console at this time on this cpu? + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console. + * + * @text is a buffer of size CONSOLE_LOG_MAX. + * + * If extended messages should be printed, @ext_text is a buffer of size + * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL. + * + * If dropped messages should be printed, @dropped_text is a buffer of size + * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns false if the given console has no next record to print, otherwise + * true. + * + * Requires the console_lock. */ -static inline int can_use_console(void) +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) { - return cpu_online(raw_smp_processor_id()) || have_callable_console(); + static int panic_console_dropped; + struct printk_info info; + struct printk_record r; + unsigned long flags; + char *write_text; + size_t len; + + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + *handover = false; + + if (!prb_read_valid(prb, con->seq, &r)) + return false; + + if (con->seq != r.info->seq) { + con->dropped += r.info->seq - con->seq; + con->seq = r.info->seq; + if (panic_in_progress() && panic_console_dropped++ > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); + } + } + + /* Skip record that has level above the console loglevel. */ + if (suppress_message_printing(r.info->level)) { + con->seq++; + goto skip; + } + + if (ext_text) { + write_text = ext_text; + len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); + len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, &r.info->dev_info); + } else { + write_text = text; + len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + } + + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + + stop_critical_timings(); /* don't trace print latency */ + call_console_driver(con, write_text, len, dropped_text); + start_critical_timings(); + + con->seq++; + + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); +skip: + return true; +} + +/* + * Print out all remaining records to all consoles. + * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + static char dropped_text[DROPPED_TEXT_MAX]; + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; + bool any_usable = false; + struct console *con; + bool any_progress; + + *next_seq = 0; + *handover = false; + + do { + any_progress = false; + + for_each_console(con) { + bool progress; + + if (!console_is_usable(con)) + continue; + any_usable = true; + + if (con->flags & CON_EXTENDED) { + /* Extended consoles do not print "dropped messages". */ + progress = console_emit_next_record(con, &text[0], + &ext_text[0], NULL, + handover); + } else { + progress = console_emit_next_record(con, &text[0], + NULL, &dropped_text[0], + handover); + } + if (*handover) + return false; + + /* Track the next of the highest seq flushed. */ + if (con->seq > *next_seq) + *next_seq = con->seq; + + if (!progress) + continue; + any_progress = true; + + /* Allow panic_cpu to take over the consoles safely. */ + if (abandon_console_lock_in_panic()) + return false; + + if (do_cond_resched) + cond_resched(); + } + } while (any_progress); + + return any_usable; } /** @@ -2595,27 +2828,20 @@ static inline int can_use_console(void) * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * - * If there is output waiting, we wake /dev/kmsg and syslog() users. - * * console_unlock(); may be called from any context. */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; + bool do_cond_resched; + bool handover; + bool flushed; + u64 next_seq; if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2624,117 +2850,34 @@ void console_unlock(void) * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. Therefore the value must be stored before - * and cleared after the "again" goto label. + * messages practically incapacitating the system. Therefore, create + * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } - - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; - -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; - - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - } + do { + console_may_schedule = 0; - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } - - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } + flushed = console_flush_all(do_cond_resched, &next_seq, &handover); + if (!handover) + __console_unlock(); /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. + * Abort if there was a failure to flush all messages to all + * usable consoles. Either it is not possible to flush (in + * which case it would be an infinite loop of retrying) or + * another context has taken over printing. */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; + if (!flushed) + break; /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). + * Some context may have added new records after + * console_flush_all() but before unlocking the console. + * Re-check if there is a new record to flush. If the trylock + * fails, another context is already handling the printing. */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; - - if (do_cond_resched) - cond_resched(); - } - - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - - console_locked = 0; - up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) - goto again; + } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } EXPORT_SYMBOL(console_unlock); @@ -2774,6 +2917,9 @@ void console_unblank(void) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); console_unlock(); + + if (!oops_in_progress) + pr_flush(1000, true); } /** @@ -2794,8 +2940,14 @@ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + c->seq = seq; + } console_unlock(); } @@ -2826,6 +2978,7 @@ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { + __pr_flush(console, 1000, true); console_lock(); console->flags &= ~CON_ENABLED; console_unlock(); @@ -2837,6 +2990,7 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -2923,6 +3077,11 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } +#define con_printk(lvl, con, fmt, ...) \ + printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \ + (con->flags & CON_BOOT) ? "boot" : "", \ + con->name, con->index, ##__VA_ARGS__) + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -3030,26 +3189,15 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; + newcon->dropped = 0; if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; - /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - console_seq = syslog_seq; + newcon->seq = syslog_seq; mutex_unlock(&syslog_lock); + } else { + /* Begin with next message. */ + newcon->seq = prb_next_seq(prb); } console_unlock(); console_sysfs_notify(); @@ -3061,9 +3209,7 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - pr_info("%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); + con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { @@ -3082,9 +3228,7 @@ int unregister_console(struct console *console) struct console *con; int res; - pr_info("%sconsole [%s%d] disabled\n", - (console->flags & CON_BOOT) ? "boot" : "" , - console->name, console->index); + con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) @@ -3218,6 +3362,79 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* If @con is specified, only wait for that console. Otherwise wait for all. */ +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *c; + u64 last_diff = 0; + u64 printk_seq; + u64 diff; + u64 seq; + + might_sleep(); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + console_lock(); + for_each_console(c) { + if (con && con != c) + continue; + if (!console_is_usable(c)) + continue; + printk_seq = c->seq; + if (printk_seq < seq) + diff += seq - printk_seq; + } + console_unlock(); + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + /* no timeout limit */ + msleep(100); + } else if (remaining < 100) { + msleep(remaining); + remaining = 0; + } else { + msleep(100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Process context. May sleep while acquiring console lock. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return __pr_flush(NULL, timeout_ms, reset_on_progress); +} +EXPORT_SYMBOL(pr_flush); + /* * Delayed printk version, for scheduler-internal messages: */ @@ -3228,7 +3445,7 @@ static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { - int pending = __this_cpu_xchg(printk_pending, 0); + int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { /* If trylock fails, someone else is doing the printing */ @@ -3243,28 +3460,43 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); -void wake_up_klogd(void) +static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the wait queue is empty. + * + * The full memory barrier within wq_has_sleeper() pairs with the full + * memory barrier within set_current_state() of + * prepare_to_wait_event(), which is called after ___wait_event() adds + * the waiter but before it has checked the wait condition. + * + * This pairs with devkmsg_read:A and syslog_print:A. + */ + if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ + (val & PRINTK_PENDING_OUTPUT)) { + this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } -void defer_console_output(void) +void wake_up_klogd(void) { - if (!printk_percpu_data_ready()) - return; + __wake_up_klogd(PRINTK_PENDING_WAKEUP); +} - preempt_disable(); - __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); - preempt_enable(); +void defer_console_output(void) +{ + /* + * New messages may have been added directly to the ringbuffer + * using vprintk_store(), so wake any waiters as well. + */ + __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) @@ -3600,26 +3832,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif #ifdef CONFIG_SMP -static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); -static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); +static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); /** - * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant - * spinning lock is not owned by any CPU. + * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant + * spinning lock is not owned by any CPU. * * Context: Any context. */ -void __printk_wait_on_cpu_lock(void) +void __printk_cpu_sync_wait(void) { do { cpu_relax(); - } while (atomic_read(&printk_cpulock_owner) != -1); + } while (atomic_read(&printk_cpu_sync_owner) != -1); } -EXPORT_SYMBOL(__printk_wait_on_cpu_lock); +EXPORT_SYMBOL(__printk_cpu_sync_wait); /** - * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant - * spinning lock. + * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant + * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the @@ -3628,7 +3860,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock); * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ -int __printk_cpu_trylock(void) +int __printk_cpu_sync_try_get(void) { int cpu; int old; @@ -3638,79 +3870,80 @@ int __printk_cpu_trylock(void) /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with - * __printk_cpu_unlock:B. + * __printk_cpu_sync_put:B. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then - * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_put:A can never read from + * __printk_cpu_sync_try_get:B. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of this CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of this CPU */ - old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, - cpu); /* LMM(__printk_cpu_trylock:A) */ + old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, + cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing - * data: LMM(__printk_cpu_trylock:B) + * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ - atomic_inc(&printk_cpulock_nested); + atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } -EXPORT_SYMBOL(__printk_cpu_trylock); +EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** - * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. + * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ -void __printk_cpu_unlock(void) +void __printk_cpu_sync_put(void) { - if (atomic_read(&printk_cpulock_nested)) { - atomic_dec(&printk_cpulock_nested); + if (atomic_read(&printk_cpu_sync_nested)) { + atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: - * LMM(__printk_cpu_unlock:A) + * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs - * with __printk_cpu_trylock:A. + * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, - * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of the next CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of the next CPU */ - atomic_set_release(&printk_cpulock_owner, - -1); /* LMM(__printk_cpu_unlock:B) */ + atomic_set_release(&printk_cpu_sync_owner, + -1); /* LMM(__printk_cpu_sync_put:B) */ } -EXPORT_SYMBOL(__printk_cpu_unlock); +EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 8a7b7362c0dd..2b7b6ddab4f7 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ - memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, - sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (desc_out) { + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) @@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: - atomic_long_set(&desc_out->state_var, state_val); + if (desc_out) + atomic_long_set(&desc_out->state_var, state_val); return d_state; } @@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, id); } /** @@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e) */ void prb_final_commit(struct prb_reserved_entry *e) { + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + _prb_commit(e, desc_finalized); + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, e->id); } /* @@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) */ u64 prb_next_seq(struct printk_ringbuffer *rb) { - u64 seq = 0; + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + unsigned long id; + u64 seq; + + /* Check if the cached @id still points to a valid @seq. */ + id = atomic_long_read(&desc_ring->last_finalized_id); + d_state = desc_read(desc_ring, id, NULL, &seq, NULL); - /* Search forward from the oldest descriptor. */ + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Begin searching after the last finalized record. + * + * On 0, the search must begin at 0 because of hack#2 + * of the bootstrapping phase it is not known if a + * record at index 0 exists. + */ + if (seq != 0) + seq++; + } else { + /* + * The information about the last finalized sequence number + * has gone. It should happen only when there is a flood of + * new messages and the ringbuffer is rapidly recycled. + * Give up and start from the beginning. + */ + seq = 0; + } + + /* + * The information about the last finalized @seq might be inaccurate. + * Search forward to find the current one. + */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; @@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb, rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits)); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 73cc80e01cef..18cd25e489b8 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -75,6 +75,7 @@ struct prb_desc_ring { struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; + atomic_long_t last_finalized_id; }; /* @@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index eea265082e97..156a99283b11 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task) return true; } -/* Ensure that nothing can wake it up, even SIGKILL */ +/* + * Ensure that nothing can wake it up, even SIGKILL + * + * A task is switched to this state while a ptrace operation is in progress; + * such that the ptrace operation is uninterruptible. + */ static bool ptrace_freeze_traced(struct task_struct *task) { bool ret = false; @@ -197,7 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { - WRITE_ONCE(task->__state, __TASK_TRACED); + task->jobctl |= JOBCTL_PTRACE_FROZEN; ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -207,23 +212,21 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) - return; - - WARN_ON(!task->ptrace || task->parent != current); + unsigned long flags; /* - * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. - * Recheck state under the lock to close this race. + * The child may be awake and may have cleared + * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will + * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew. */ - spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { - if (__fatal_signal_pending(task)) + if (lock_task_sighand(task, &flags)) { + task->jobctl &= ~JOBCTL_PTRACE_FROZEN; + if (__fatal_signal_pending(task)) { + task->jobctl &= ~TASK_TRACED; wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); + } + unlock_task_sighand(task, &flags); } - spin_unlock_irq(&task->sighand->siglock); } /** @@ -256,7 +259,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) */ read_lock(&tasklist_lock); if (child->ptrace && child->parent == current) { - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). @@ -266,17 +268,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) { - if (!wait_task_inactive(child, __TASK_TRACED)) { - /* - * This can only happen if may_ptrace_stop() fails and - * ptrace_stop() changes ->state back to TASK_RUNNING, - * so we should not worry about leaking __TASK_TRACED. - */ - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); - ret = -ESRCH; - } - } + if (!ret && !ignore_state && + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + ret = -ESRCH; return ret; } @@ -371,6 +365,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } +static int check_ptrace_options(unsigned long data) +{ + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + return 0; +} + static int ptrace_attach(struct task_struct *task, long request, unsigned long addr, unsigned long flags) @@ -382,8 +396,16 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) { if (addr != 0) goto out; + /* + * This duplicates the check in check_ptrace_options() because + * ptrace_attach() and ptrace_setoptions() have historically + * used different error codes for unknown ptrace options. + */ if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; + retval = check_ptrace_options(flags); + if (retval) + return retval; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); } else { flags = PT_PTRACED; @@ -447,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request, * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_STOPPED; signal_wake_up_state(task, __TASK_STOPPED); + } spin_unlock(&task->sighand->siglock); @@ -654,22 +678,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { unsigned flags; + int ret; - if (data & ~(unsigned long)PTRACE_O_MASK) - return -EINVAL; - - if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || - !IS_ENABLED(CONFIG_SECCOMP)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || - current->ptrace & PT_SUSPEND_SECCOMP) - return -EPERM; - } + ret = check_ptrace_options(data); + if (ret) + return ret; /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; @@ -812,11 +825,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, } #endif -#ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif #ifdef PTRACE_SINGLEBLOCK #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) @@ -833,8 +842,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { - bool need_siglock; - if (!valid_signal(data)) return -EIO; @@ -870,18 +877,12 @@ static int ptrace_resume(struct task_struct *child, long request, * Note that we need siglock even if ->exit_code == data and/or this * status was not reported yet, the new status must not be cleared by * wait_task_stopped() after resume. - * - * If data == 0 we do not care if wait_task_stopped() reports the old - * status and clears the code too; this can't race with the tracee, it - * takes siglock after resume. */ - need_siglock = data && !thread_group_empty(current); - if (need_siglock) - spin_lock_irq(&child->sighand->siglock); + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; + child->jobctl &= ~JOBCTL_TRACED; wake_up_state(child, __TASK_TRACED); - if (need_siglock) - spin_unlock_irq(&child->sighand->siglock); + spin_unlock_irq(&child->sighand->siglock); return 0; } @@ -1204,9 +1205,7 @@ int ptrace_request(struct task_struct *child, long request, } #endif -#ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: -#endif #ifdef PTRACE_SINGLEBLOCK case PTRACE_SINGLEBLOCK: #endif @@ -1219,9 +1218,8 @@ int ptrace_request(struct task_struct *child, long request, return ptrace_resume(child, request, data); case PTRACE_KILL: - if (child->exit_state) /* already dead */ - return 0; - return ptrace_resume(child, request, SIGKILL); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, child); + return 0; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: @@ -1268,10 +1266,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef arch_ptrace_attach -#define arch_ptrace_attach(child) do { } while (0) -#endif - SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, unsigned long, data) { @@ -1280,8 +1274,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); - if (!ret) - arch_ptrace_attach(current); goto out; } @@ -1293,12 +1285,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } @@ -1438,12 +1424,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..1c630e573548 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -77,31 +77,56 @@ config TASKS_RCU_GENERIC This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + config TASKS_RCU - def_bool PREEMPTION + bool + default n + select IRQ_WORK + +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. Not for manual selection. + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. config TASKS_RUDE_RCU - def_bool 0 + bool + default n + select IRQ_WORK + +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses - only context switch (including preemption) and user-mode - execution as quiescent states. It forces IPIs and context - switches on all online CPUs, including idle ones, so use - with caution. + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. config TASKS_TRACE_RCU - def_bool 0 + bool + default n select IRQ_WORK - help - This option enables a task-based RCU implementation that uses - explicit rcu_read_lock_trace() read-side markers, and allows - these readers to appear in the idle loop as well as on the CPU - hotplug code paths. It can force IPIs on online CPUs, including - idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -195,6 +220,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU @@ -225,7 +264,7 @@ config RCU_NOCB_CPU config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT + depends on RCU_EXPERT && TASKS_TRACE_RCU default PREEMPT_RT || NR_CPUS < 8 help Use this option to further reduce the number of IPIs sent diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..9b64e55d4f61 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,9 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -47,9 +44,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests @@ -66,9 +60,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance tests @@ -91,6 +82,20 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 21000 + default 20 if ANDROID + default 0 if !ANDROID + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..4916077119f3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,8 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +extern int sysctl_sched_rt_runtime; + /* * Return the counter portion of a sequence number previously returned * by rcu_seq_snap() or rcu_seq_current(). @@ -210,7 +212,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); static inline bool rcu_stall_is_suppressed(void) { @@ -523,6 +527,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); @@ -534,14 +540,19 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 81145c3ece25..c54ea2b6a36b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index e373fbe44da5..431cee212467 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -56,13 +56,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, int flags) { - rsclp->flags |= flags; + WRITE_ONCE(rsclp->flags, rsclp->flags | flags); } static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, int flags) { - rsclp->flags &= ~flags; + WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags); } static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5e4f1f83d38e..277a5bfb37d4 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = { .name = "srcud" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks scalability testing. */ @@ -295,6 +297,16 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -324,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -797,7 +817,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 422f7e4cc08d..7120165a9342 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -284,7 +284,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ +static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ /* * Allocate an element from the rcu_tortures pool. @@ -387,7 +387,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!READ_ONCE(rcu_fwd_cb_nodelay) && + if (!atomic_read(&rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -674,6 +674,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" @@ -708,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" @@ -736,6 +738,50 @@ static struct rcu_torture_ops busted_srcud_ops = { }; /* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + +#ifdef CONFIG_TASKS_RCU + +/* * Definitions for RCU-tasks torture testing. */ @@ -778,47 +824,16 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -/* - * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. - */ +#define TASKS_OPS &tasks_ops, -static void synchronize_rcu_trivial(void) -{ - int cpu; +#else // #ifdef CONFIG_TASKS_RCU - for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); - WARN_ON_ONCE(raw_smp_processor_id() != cpu); - } -} +#define TASKS_OPS -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) -{ - preempt_disable(); - return 0; -} +#endif // #else #ifdef CONFIG_TASKS_RCU -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) -{ - preempt_enable(); -} -static struct rcu_torture_ops trivial_ops = { - .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock_trivial, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock_trivial, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_no_completed, - .sync = synchronize_rcu_trivial, - .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, - .irq_capable = 1, - .name = "trivial" -}; +#ifdef CONFIG_TASKS_RUDE_RCU /* * Definitions for rude RCU-tasks torture testing. @@ -849,6 +864,17 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for tracing RCU-tasks torture testing. */ @@ -891,6 +917,15 @@ static struct rcu_torture_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -997,7 +1032,7 @@ static int rcu_torture_boost(void *arg) goto checkwait; /* Wait for the next test interval. */ - oldstarttime = boost_starttime; + oldstarttime = READ_ONCE(boost_starttime); while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); if (stutter_wait("rcu_torture_boost")) @@ -1041,10 +1076,11 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime && !kthread_should_stop()) { + while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { if (oldstarttime == boost_starttime) { - boost_starttime = jiffies + test_boost_interval * HZ; + WRITE_ONCE(boost_starttime, + jiffies + test_boost_interval * HZ); n_rcu_torture_boosts++; } mutex_unlock(&boost_mutex); @@ -1175,7 +1211,7 @@ rcu_torture_writer(void *arg) " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -1183,6 +1219,7 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; } do { @@ -1276,7 +1313,7 @@ rcu_torture_writer(void *arg) boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && - !READ_ONCE(rcu_fwd_cb_nodelay) && + !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && boot_ended) @@ -1319,6 +1356,17 @@ rcu_torture_fakewriter(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && @@ -2180,7 +2228,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - mutex_lock(&rcu_fwd_mutex); // Serialize histograms. pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; @@ -2193,7 +2240,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) gps_old = gps; } pr_cont("\n"); - mutex_unlock(&rcu_fwd_mutex); } /* Callback function for continuous-flood RCU callbacks. */ @@ -2281,6 +2327,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, unsigned long stopat; static DEFINE_TORTURE_RANDOM(trs); + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (!cur_ops->sync) return; // Cannot do need_resched() forward progress testing without ->sync. if (cur_ops->call && cur_ops->cb_barrier) { @@ -2289,7 +2336,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, } /* Tight loop containing cond_resched(). */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); @@ -2325,6 +2372,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); cur_ops->sync(); /* Wait for running CB to complete. */ + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for queued callbacks. */ } @@ -2333,7 +2381,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, destroy_rcu_head_on_stack(&fcs.rh); } schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, false); + atomic_dec(&rcu_fwd_cb_nodelay); } /* Carry out call_rcu() forward-progress testing. */ @@ -2353,13 +2401,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) unsigned long stopat; unsigned long stoppedat; + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ if (!cur_ops->call) return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. */ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; @@ -2414,6 +2463,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(rfp); @@ -2427,11 +2477,13 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. rcu_torture_fwd_cb_hist(rfp); + mutex_unlock(&rcu_fwd_mutex); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); + atomic_dec(&rcu_fwd_cb_nodelay); } @@ -2511,7 +2563,7 @@ static int rcu_torture_fwd_prog(void *args) firsttime = false; WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { - while (READ_ONCE(rcu_fwd_seq) == oldseq) + while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) schedule_timeout_interruptible(1); oldseq = READ_ONCE(rcu_fwd_seq); } @@ -2905,12 +2957,16 @@ rcu_torture_cleanup(void) int i; if (torture_cleanup_begin()) { - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } + rcu_gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); + rcu_gp_slow_unregister(NULL); return; } @@ -2961,8 +3017,10 @@ rcu_torture_cleanup(void) * Wait for all RCU callbacks to fire, then do torture-type-specific * cleanup operations. */ - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } if (cur_ops->cleanup != NULL) cur_ops->cleanup(); @@ -3005,6 +3063,7 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3085,9 +3144,9 @@ rcu_torture_init(void) int flags = 0; unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, - &tasks_tracing_ops, &trivial_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -3309,6 +3368,7 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); + rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5489ff7f478e..909644abee67 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = { .name = "srcu" }; +#ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. static void rcu_tasks_ref_scale_read_section(const int nloops) @@ -232,6 +234,16 @@ static struct ref_scale_ops rcu_tasks_ops = { .name = "rcu-tasks" }; +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -261,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = { .name = "rcu-trace" }; +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for reference count static atomic_t refcnt; @@ -790,7 +810,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6833d8887181..50ba70f019de 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -24,6 +24,7 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/srcu.h> #include "rcu.h" @@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all. + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape (default). + * 0x1x: Convert when excessive contention encountered. + */ +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_AUTO; +module_param(convert_to_big, int, 0444); + +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp) +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. + */ +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -92,6 +173,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); + if (!ssp->node) + return false; /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; @@ -105,10 +189,10 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; if (snp == &ssp->node[0]) { @@ -129,39 +213,31 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_WORK(&sdp->work, srcu_invoke_callbacks); - timer_setup(&sdp->delay_work, srcu_delay_timer, 0); - sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -170,13 +246,25 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); + ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp); + init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + } + } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -214,6 +302,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); +} + +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp, flags); + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp, flags); + return; + } + __srcu_transition_to_big(ssp); + spin_unlock_irqrestore_rcu_node(ssp, flags); +} + +/* + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. + */ +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +{ + unsigned long j; + + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_size_jiffies != j) { + ssp->srcu_size_jiffies = j; + ssp->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); +} + +/* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added @@ -343,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. +#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. /* * Return grace-period delay, zero if there are expedited grace @@ -351,10 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + unsigned long jbase = SRCU_INTERVAL; + + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + jbase = 0; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) + jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + jbase = 1; + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } /** @@ -382,13 +561,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(ssp->sda); - ssp->sda = NULL; + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } + kfree(ssp->node); + ssp->node = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -434,9 +620,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); + struct srcu_data *sdp; int state; + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ @@ -445,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp) (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_gp_seq); @@ -517,7 +709,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; + int ss_state; /* Prevent more than one additional grace period. */ mutex_lock(&ssp->srcu_cb_mutex); @@ -526,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); + cbdelay = !!srcu_get_delay(ssp); WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -537,38 +731,45 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irqrestore_rcu_node(sdp, flags); + } + /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&ssp->srcu_cb_mutex); @@ -583,6 +784,14 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { spin_unlock_irq_rcu_node(ssp); } + + /* Transition to big if needed. */ + if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { + if (ss_state == SRCU_SIZE_ALLOC) + init_srcu_struct_nodes(ssp, GFP_KERNEL); + else + smp_store_release(&ssp->srcu_size_state, ss_state + 1); + } } /* @@ -596,20 +805,24 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp unsigned long s) { unsigned long flags; + unsigned long sgsne; - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) - return; - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + if (snp) + for (; snp != NULL; snp = snp->srcu_parent) { + sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) + return; + spin_lock_irqsave_rcu_node(snp, flags); + sgsne = snp->srcu_gp_seq_needed_exp; + if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { + spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); - return; } - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); @@ -630,39 +843,47 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); - struct srcu_node *snp = sdp->mynode; + unsigned long sgsne; + struct srcu_node *snp; + struct srcu_node *snp_leaf; unsigned long snp_seq; - /* Each pass through the loop does one level of the srcu_node tree. */ - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) - return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + /* Ensure that snp node tree is fully initialized before traversing it */ + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + snp_leaf = NULL; + else + snp_leaf = sdp->mynode; + + if (snp_leaf) + /* Each pass through the loop does one level of the srcu_node tree. */ + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; - if (snp == sdp->mynode && snp_seq == s) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == sdp->mynode && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -678,9 +899,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); + !!srcu_get_delay(ssp)); else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } @@ -814,11 +1041,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, bool needgp = false; unsigned long s; struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, @@ -834,10 +1067,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, needexp = true; } spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, sdp_mynode, s); srcu_read_unlock(ssp, idx); return s; } @@ -1097,6 +1337,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp) complete(&ssp->srcu_barrier_completion); } +/* + * Enqueue an srcu_barrier() callback on the specified srcu_data + * structure's ->cblist. but only if that ->cblist already has at least one + * callback enqueued. Note that if a CPU already has callbacks enqueue, + * it must have already registered the need for a future grace period, + * so all we need do is enqueue a callback that will use the same grace + * period as the last callback already in the queue. + */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @ssp: srcu_struct on which to wait for in-flight callbacks. @@ -1104,7 +1366,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; + int idx; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1120,27 +1382,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + idx = srcu_read_lock(ssp); + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + srcu_read_unlock(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) @@ -1214,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp); } @@ -1228,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1318,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; ssp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + curdelay = srcu_get_delay(ssp); + if (curdelay) { + WRITE_ONCE(ssp->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(ssp->reschedule_jiffies) == j) { + WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); + if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + curdelay = 1; + } else { + WRITE_ONCE(ssp->reschedule_count, 1); + WRITE_ONCE(ssp->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } void srcutorture_get_gp_data(enum rcutorture_type test_type, @@ -1337,43 +1603,69 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + "SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; + int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; - - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %c)", - cpu, c0, c1, - "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); - s0 += c0; - s1 += c1; + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; + + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); + if (SRCU_SIZING_IS_TORTURE()) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); @@ -1390,6 +1682,17 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. + pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* * Once that is set, call_srcu() can follow the normal path and * queue delayed work. This must follow RCU workqueues creation @@ -1400,6 +1703,8 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 33d896d85902..5cefc702158f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * a slowpath during the update. After this function returns, all * subsequent calls to rcu_sync_is_idle() will return false, which * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. + * rcu_sync_exit() re-enables reader fastpaths. * * When called in isolation, rcu_sync_enter() must wait for a grace * period, however, closely spaced calls to rcu_sync_enter() can diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d64f0b1d8cd3..3925e32159b5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,7 +46,7 @@ struct rcu_tasks_percpu { /** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. @@ -77,7 +77,7 @@ struct rcu_tasks_percpu { * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct wait_queue_head cbs_wq; + struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; @@ -113,17 +113,17 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ - .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ - .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1, \ + .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ .percpu_dequeue_lim = 1, \ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ @@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644); #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); @@ -261,14 +266,16 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); rtp = rtpcp->rtpp; - wake_up(&rtp->cbs_wq); + rcuwait_wake_up(&rtp->cbs_wait); } // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { + int chosen_cpu; unsigned long flags; + int ideal_cpu; unsigned long j; bool needadjust = false; bool needwake; @@ -278,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->func = func; local_irq_save(flags); rcu_read_lock(); - rtpcp = per_cpu_ptr(rtp->rtpcpu, - smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. j = jiffies; @@ -302,7 +310,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); + WRITE_ONCE(rtp->percpu_enqueue_shift, 0); WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); @@ -417,7 +425,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); @@ -460,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu } } - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -496,7 +504,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); + housekeeping_affine(current, HK_TYPE_RCU); WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! /* @@ -509,7 +517,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* If there were none, wait a bit and start over. */ - wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); if (needgpcb & 0x2) { // Wait for one grace period. @@ -548,8 +558,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) static void __init rcu_tasks_bootup_oddness(void) { #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RCU pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); @@ -568,7 +585,17 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + int cpu; + bool havecbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + havecbs = true; + break; + } + } pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -576,7 +603,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], + ".C"[havecbs], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -592,10 +619,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t); /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); + struct task_struct *g; int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); @@ -621,30 +653,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * is empty, we are done. */ lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; while (!list_empty(&holdouts)) { + ktime_t exp; bool firstreport; bool needreport; int rtst; - /* Slowly back off waiting for holdouts */ + // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(fract); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } if (fract < HZ) fract++; rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) + if (needreport) { lastreport = jiffies; + reported = true; + } firstreport = true; WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } } set_tasks_gp_state(rtp, RTGS_POST_GP); @@ -950,6 +1002,9 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + if (num_online_cpus() <= 1) + return; // Fastpath for only one CPU. + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4c25a6283b0..c25ba442044a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -87,11 +87,12 @@ static struct rcu_state rcu_state = { .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), + .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock), .name = RCU_NAME, .abbr = RCU_ABBR, .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), - .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), + .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -153,7 +154,7 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub kthread realtime priority */ +/* rcuc/rcub/rcuop kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; module_param(kthread_prio, int, 0444); @@ -222,6 +223,16 @@ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) } /* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective? This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ + return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +/* * Return true if an RCU grace period is in progress. The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -1086,9 +1097,8 @@ void rcu_irq_enter_irqson(void) * Just check whether or not this CPU has non-offloaded RCU callbacks * queued. */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) +int rcu_needs_cpu(void) { - *nextevt = KTIME_MAX; return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } @@ -1167,15 +1177,20 @@ void rcu_request_urgent_qs_task(struct task_struct *t) bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; - struct rcu_node *rnp; bool ret = false; if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) + /* + * Strictly, we care here about the case where the current CPU is + * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * not being up to date. So arch_spin_is_locked() might have a + * false positive if it's held by some *other* CPU, but that's + * OK because that just means a false *negative* on the warning. + */ + if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) ret = true; preempt_enable_notrace(); return ret; @@ -1260,8 +1275,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * For more detail, please refer to the "Hotplug CPU" section * of RCU's Requirements documentation. */ - if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { - bool onl; + if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) { struct rcu_node *rnp1; pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", @@ -1270,9 +1284,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", - __func__, rdp->cpu, ".o"[onl], + __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)], (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); return 1; /* Break things loose after complaining. */ @@ -1666,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1692,11 +1707,37 @@ static void note_gp_changes(struct rcu_data *rdp) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. */ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_idle(delay); } @@ -1739,7 +1780,6 @@ static void rcu_strict_gp_boundary(void *unused) */ static noinline_for_stack bool rcu_gp_init(void) { - unsigned long firstseq; unsigned long flags; unsigned long oldmask; unsigned long mask; @@ -1782,22 +1822,17 @@ static noinline_for_stack bool rcu_gp_init(void) * of RCU's Requirements documentation. */ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); + /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - // Wait for CPU-hotplug operations that might have - // started before this grace period did. - smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. - firstseq = READ_ONCE(rnp->ofl_seq); - if (firstseq & 0x1) - while (firstseq == READ_ONCE(rnp->ofl_seq)) - schedule_timeout_idle(1); // Can't wake unless RCU is watching. - smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. - raw_spin_lock(&rcu_state.ofl_lock); - raw_spin_lock_irq_rcu_node(rnp); + local_irq_save(flags); + arch_spin_lock(&rcu_state.ofl_lock); + raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); continue; } @@ -1832,8 +1867,9 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -2088,14 +2124,29 @@ static noinline void rcu_gp_cleanup(void) /* Advance CBs to reduce false positives below. */ offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period). The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist. We do not want to acquire that lock because + // it can be heavily contended during callback floods. + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - trace_rcu_grace_period(rcu_state.name, - rcu_state.gp_seq, - TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2601,6 +2652,13 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2616,6 +2674,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + if (user) + rcu_tasks_classic_qs(current, false); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -2850,10 +2910,12 @@ static void rcu_cpu_kthread(unsigned int cpu) { unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity); int spincnt; trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { + WRITE_ONCE(*j, jiffies); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); @@ -2874,6 +2936,7 @@ static void rcu_cpu_kthread(unsigned int cpu) schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; + WRITE_ONCE(*j, jiffies); } static struct smp_hotplug_thread rcu_cpu_thread_spec = { @@ -2894,7 +2957,7 @@ static int __init rcu_spawn_core_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + if (use_softirq) return 0; WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); @@ -2995,9 +3058,47 @@ static void check_cb_ovld(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); } -/* Helper function for call_rcu() and friends. */ -static void -__call_rcu(struct rcu_head *head, rcu_callback_t func) +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + */ +void call_rcu(struct rcu_head *head, rcu_callback_t func) { static atomic_t doublefrees; unsigned long flags; @@ -3011,7 +3112,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) /* * Probable double call_rcu(), so leak the callback. * Use rcu:rcu_callback trace event to find the previous - * time callback was passed to __call_rcu(). + * time callback was passed to call_rcu(). */ if (atomic_inc_return(&doublefrees) < 4) { pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); @@ -3022,8 +3123,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) } head->func = func; head->next = NULL; - local_irq_save(flags); kasan_record_aux_stack_noalloc(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3060,51 +3161,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_restore(flags); } } - -/** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. - * - * RCU read-side critical sections are delimited by rcu_read_lock() - * and rcu_read_unlock(), and may be nested. In addition, but only in - * v5.0 and later, regions of code across which interrupts, preemption, - * or softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - * - * Implementation of these memory-ordering guarantees is described here: - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. - */ -void call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} EXPORT_SYMBOL_GPL(call_rcu); @@ -3713,7 +3769,9 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPTION)) + // Invoking preempt_model_*() too early gets a splat. + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || + preempt_model_full() || preempt_model_rt()) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); @@ -3984,13 +4042,16 @@ static void rcu_barrier_callback(struct rcu_head *rhp) } /* - * Called with preemption disabled, and from cross-cpu IRQ context. + * If needed, entrain an rcu_barrier() callback on rdp->cblist. */ -static void rcu_barrier_func(void *cpu_in) +static void rcu_barrier_entrain(struct rcu_data *rdp) { - uintptr_t cpu = (uintptr_t)cpu_in; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); + unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); + lockdep_assert_held(&rcu_state.barrier_lock); + if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) + return; rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); @@ -4000,10 +4061,26 @@ static void rcu_barrier_func(void *cpu_in) atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - rcu_barrier_trace(TPS("IRQNQ"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } rcu_nocb_unlock(rdp); + smp_store_release(&rdp->barrier_seq_snap, gseq); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_handler(void *cpu_in) +{ + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(cpu != rdp->cpu); + WARN_ON_ONCE(cpu != smp_processor_id()); + raw_spin_lock(&rcu_state.barrier_lock); + rcu_barrier_entrain(rdp); + raw_spin_unlock(&rcu_state.barrier_lock); } /** @@ -4017,6 +4094,8 @@ static void rcu_barrier_func(void *cpu_in) void rcu_barrier(void) { uintptr_t cpu; + unsigned long flags; + unsigned long gseq; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -4027,15 +4106,16 @@ void rcu_barrier(void) /* Did someone else do our work for us? */ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { - rcu_barrier_trace(TPS("EarlyExit"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rcu_state.barrier_mutex); return; } /* Mark the start of the barrier operation. */ + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); rcu_seq_start(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* @@ -4047,7 +4127,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - cpus_read_lock(); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); /* * Force each CPU with callbacks to register a new callback. @@ -4056,29 +4136,31 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (cpu_is_offline(cpu) && - !rcu_rdp_is_offloaded(rdp)) +retry: + if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq) + continue; + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (!rcu_segcblist_n_cbs(&rdp->cblist)) { + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); continue; - if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { - rcu_barrier_trace(TPS("OnlineQ"), cpu, - rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); - } else if (rcu_segcblist_n_cbs(&rdp->cblist) && - cpu_is_offline(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, - rcu_state.barrier_sequence); - local_irq_disable(); - rcu_barrier_func((void *)cpu); - local_irq_enable(); - } else if (cpu_is_offline(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, - rcu_state.barrier_sequence); - } else { - rcu_barrier_trace(TPS("OnlineNQ"), cpu, - rcu_state.barrier_sequence); } + if (!rcu_rdp_cpu_online(rdp)) { + rcu_barrier_entrain(rdp); + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence); + continue; + } + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) { + schedule_timeout_uninterruptible(1); + goto retry; + } + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); } - cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -4093,6 +4175,12 @@ void rcu_barrier(void) /* Mark the end of the barrier operation. */ rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); rcu_seq_end(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + } /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_state.barrier_mutex); @@ -4140,10 +4228,12 @@ rcu_boot_init_percpu_data(int cpu) INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } @@ -4287,12 +4377,13 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + local_irq_save(flags); + arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_spin_lock(&rcu_state.barrier_lock); + raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); + raw_spin_unlock(&rcu_state.barrier_lock); newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; /* Allow lockless access for expedited grace periods. */ @@ -4304,15 +4395,18 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ + /* rcu_report_qs_rnp() *really* wants some flags to restore */ + unsigned long flags2; + + local_irq_save(flags2); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); } else { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_rcu_node(rnp); } - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(rnp->ofl_seq & 0x1); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } @@ -4326,7 +4420,7 @@ void rcu_cpu_starting(unsigned int cpu) */ void rcu_report_dead(unsigned int cpu) { - unsigned long flags; + unsigned long flags, seq_flags; unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -4340,10 +4434,8 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - raw_spin_lock(&rcu_state.ofl_lock); + local_irq_save(seq_flags); + arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -4354,10 +4446,8 @@ void rcu_report_dead(unsigned int cpu) } WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - raw_spin_unlock(&rcu_state.ofl_lock); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(rnp->ofl_seq & 0x1); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(seq_flags); rdp->cpu_started = false; } @@ -4380,7 +4470,9 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ - local_irq_save(flags); + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); + rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); my_rnp = my_rdp->mynode; rcu_nocb_lock(my_rdp); /* irqs already disabled. */ @@ -4390,10 +4482,10 @@ void rcutree_migrate_callbacks(int cpu) needwake = rcu_advance_cbs(my_rnp, rdp) || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); - WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != - !rcu_segcblist_n_cbs(&my_rdp->cblist)); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); @@ -4434,31 +4526,61 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; - int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; - - /* Force priority into range. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 - && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) - kthread_prio = 2; - else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) - kthread_prio = 1; - else if (kthread_prio < 0) - kthread_prio = 0; - else if (kthread_prio > 99) - kthread_prio = 99; - - if (kthread_prio != kthread_prio_in) - pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", - kthread_prio, kthread_prio_in); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); @@ -4476,9 +4598,17 @@ static int __init rcu_spawn_gp_kthread(void) smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); + rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4570,6 +4700,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); + mutex_init(&rnp->boost_kthread_mutex); } } @@ -4585,6 +4716,28 @@ static void __init rcu_init_one(void) } /* + * Force priority from the kernel command-line into range. + */ +static void __init sanitize_kthread_prio(void) +{ + int kthread_prio_in = kthread_prio; + + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + + if (kthread_prio != kthread_prio_in) + pr_alert("%s: Limited prio to %d from %d\n", + __func__, kthread_prio, kthread_prio_in); +} + +/* * Compute the rcu_node tree geometry from kernel parameters. This cannot * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. @@ -4701,7 +4854,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4738,12 +4890,13 @@ static void __init kfree_rcu_batch_init(void) void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); kfree_rcu_batch_init(); rcu_bootup_announce(); + sanitize_kthread_prio(); rcu_init_geometry(); rcu_init_one(); if (dump_tree) @@ -4757,17 +4910,15 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 486fc901bd08..2ccf5845957d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include <linux/cache.h> +#include <linux/kthread.h> #include <linux/spinlock.h> #include <linux/rtmutex.h> #include <linux/threads.h> @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ @@ -56,8 +61,6 @@ struct rcu_node { /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; - unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */ - /* Online CPUs for next grace period. */ unsigned long expmask; /* CPUs or groups that need to check in */ /* to allow the current expedited GP */ /* to complete. */ @@ -110,6 +113,9 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ + struct mutex boost_kthread_mutex; + /* Exclusion for thread spawning and affinity */ + /* manipulation. */ struct task_struct *boost_kthread_task; /* kthread that takes care of priority */ /* boosting for this rcu_node structure. */ @@ -190,6 +196,7 @@ struct rcu_data { bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ + unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; int exp_dynticks_snap; /* Double-check need for IPI. */ @@ -203,6 +210,8 @@ struct rcu_data { int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ + struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */ + /* spawning */ /* The following fields are used by call_rcu, hence own cacheline. */ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; @@ -237,6 +246,7 @@ struct rcu_data { /* rcuc per-CPU kthread or NULL. */ unsigned int rcu_cpu_kthread_status; char rcu_cpu_has_work; + unsigned long rcuc_activity; /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ @@ -249,6 +259,7 @@ struct rcu_data { unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ int cpu; }; @@ -302,9 +313,8 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 boost ____cacheline_internodealigned_in_smp; - /* Subject to priority boost. */ - unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_seq ____cacheline_internodealigned_in_smp; + /* Grace-period sequence #. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ @@ -323,6 +333,8 @@ struct rcu_state { /* rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ @@ -355,9 +367,10 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ - raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + int nocb_is_setup; /* nocb is setup from boot */ }; /* Values for rcu_state structure's gp_flags field. */ @@ -415,7 +428,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); -static void __init rcu_spawn_boost_kthreads(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -433,7 +445,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 237a79989aba..0f70f62039a9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -334,15 +334,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -417,13 +415,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -496,13 +593,14 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_node *rnp_root = rcu_get_root(); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + mask = READ_ONCE(rnp->expmask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->rcu_forced_tick_exp) continue; @@ -570,7 +668,7 @@ static void synchronize_rcu_expedited_wait(void) dump_cpu_task(cpu); } } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; } } @@ -621,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -656,7 +743,7 @@ static void rcu_exp_handler(void *unused) */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_dynticks_curr_cpu_in_eqs()) { + rcu_is_cpu_rrupt_from_idle()) { rcu_report_exp_rdp(rdp); } else { WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); @@ -847,20 +934,19 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index eeafb546a7a0..46694e13398a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ - -static bool rcu_nocb_is_setup; - static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str) cpumask_setall(rcu_nocb_mask); } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; return 1; } __setup("rcu_nocbs", rcu_nocb_setup); @@ -215,14 +212,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, bool force, unsigned long flags) @@ -1169,7 +1158,7 @@ void __init rcu_init_nohz(void) struct rcu_data *rdp; #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ @@ -1180,10 +1169,10 @@ void __init rcu_init_nohz(void) return; } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; } - if (!rcu_nocb_is_setup) + if (!rcu_state.nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1226,6 +1215,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); rcu_cblist_init(&rdp->nocb_bypass); + mutex_init(&rdp->nocb_gp_kthread_mutex); } /* @@ -1238,8 +1228,9 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_data *rdp_gp; struct task_struct *t; + struct sched_param sp; - if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup) return; /* If there already is an rcuo kthread, then nothing to do. */ @@ -1247,40 +1238,34 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) return; /* If we didn't spawn the GP kthread first, reorganize! */ + sp.sched_priority = kthread_prio; rdp_gp = rdp->nocb_gp_rdp; + mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); if (!rdp_gp->nocb_gp_kthread) { t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, "rcuog/%d", rdp_gp->cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return; + } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) return; + + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - if (rcu_nocb_is_setup) { - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); - } -} - /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_gp_stride = -1; module_param(rcu_nocb_gp_stride, int, 0444); @@ -1348,7 +1333,7 @@ static void __init rcu_organize_nocb_kthreads(void) */ void rcu_bind_current_to_nocb(void) { - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask)) WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); @@ -1537,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) { } -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - static void show_rcu_nocb_state(struct rcu_data *rdp) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c5b45c2f68a1..c8ba0fe17267 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -330,7 +330,7 @@ void rcu_note_context_switch(bool preempt) * then queue the task as required based on the states * of any ongoing and expedited grace periods. */ - WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); + WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp)); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); trace_rcu_preempt_task(rcu_state.name, t->pid, @@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { @@ -556,16 +557,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - /* Unboost if we were boosted. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); - /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) rcu_report_exp_rnp(rnp, true); + + /* Unboost if we were boosted. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) + rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); } else { local_irq_restore(flags); } @@ -660,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + IS_ENABLED(CONFIG_PREEMPT_RT)) + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( + rcu_preempt_deferred_qs_handler); + else + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -773,7 +780,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int cpu; int i; struct list_head *lhp; - bool onl; struct rcu_data *rdp; struct rcu_node *rnp1; @@ -797,9 +803,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_cont("\n"); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { rdp = per_cpu_ptr(&rcu_data, cpu); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", - cpu, ".o"[onl], + cpu, ".o"[rcu_rdp_cpu_online(rdp)], (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); } @@ -996,12 +1001,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) */ static void rcu_cpu_kthread_setup(unsigned int cpu) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); #ifdef CONFIG_RCU_BOOST struct sched_param sp; sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); #endif /* #ifdef CONFIG_RCU_BOOST */ + + WRITE_ONCE(rdp->rcuc_activity, jiffies); } #ifdef CONFIG_RCU_BOOST @@ -1123,7 +1131,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1172,15 +1181,14 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; + mutex_lock(&rnp->boost_kthread_mutex); if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - return; - - rcu_state.boost = 1; + goto out; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - return; + goto out; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; @@ -1188,6 +1196,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + + out: + mutex_unlock(&rnp->boost_kthread_mutex); } /* @@ -1210,29 +1221,19 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; + mutex_lock(&rnp->boost_kthread_mutex); for_each_leaf_node_possible_cpu(rnp, cpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); - if (cpumask_weight(cm) == 0) - cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (cpumask_empty(cm)) + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); set_cpus_allowed_ptr(t, cm); + mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm); } -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) - if (rcu_rnp_online_cpus(rnp)) - rcu_spawn_one_boost_kthread(rnp); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1258,10 +1259,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void __init rcu_spawn_boost_kthreads(void) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* @@ -1291,7 +1288,7 @@ static void rcu_bind_gp_kthread(void) { if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current, HK_FLAG_RCU); + housekeeping_affine(current, HK_TYPE_RCU); } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 21bebf7c9030..a001e1e7a992 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly; #define RCU_STALL_MIGHT_DIV 8 #define RCU_STALL_MIGHT_MIN (2 * HZ) +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. */ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { @@ -379,6 +407,15 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) return j > 2 * HZ; } +static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -430,6 +467,29 @@ static void print_cpu_stall_info(int cpu) falsepositive ? " (false positive?)" : ""); } +static void rcuc_kthread_dump(struct rcu_data *rdp) +{ + int cpu; + unsigned long j; + struct task_struct *rcuc; + + rcuc = rdp->rcu_cpu_kthread_task; + if (!rcuc) + return; + + cpu = task_cpu(rcuc); + if (cpu_is_offline(cpu) || idle_cpu(cpu)) + return; + + if (!rcu_is_rcuc_kthread_starving(rdp, &j)) + return; + + pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j); + sched_show_task(rcuc); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); +} + /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { @@ -533,9 +593,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { rcu_dump_cpu_stacks(); @@ -594,13 +654,16 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + if (!use_softirq) + rcuc_kthread_dump(rdp); + rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 156892c22bb5..fc7fef575606 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -407,6 +407,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } EXPORT_SYMBOL_GPL(__wait_rcu_gp); +void finish_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, NULL); + __set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL_GPL(finish_rcuwait); + #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) { @@ -499,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..3c35445bf5ad 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -23,7 +23,7 @@ * this indicates whether you can reboot with ctrl-alt-del: the default is yes */ -int C_A_D = 1; +static int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); @@ -48,12 +48,20 @@ int reboot_cpu; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; +struct sys_off_handler { + struct notifier_block nb; + int (*sys_off_cb)(struct sys_off_data *data); + void *cb_data; + enum sys_off_mode mode; + bool blocking; + void *list; +}; + /* - * If set, this is used for preparing the system to power off. + * Temporary stub that prevents linkage failure while we're in process + * of removing all uses of legacy pm_power_off() around the kernel. */ - -void (*pm_power_off_prepare)(void); -EXPORT_SYMBOL_GPL(pm_power_off_prepare); +void __weak (*pm_power_off)(void); /** * emergency_restart - reboot the system @@ -281,6 +289,370 @@ void kernel_halt(void) } EXPORT_SYMBOL_GPL(kernel_halt); +/* + * Notifier list for kernel code which wants to be called + * to prepare system for power off. + */ +static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list); + +/* + * Notifier list for kernel code which wants to be called + * to power off system. + */ +static ATOMIC_NOTIFIER_HEAD(power_off_handler_list); + +static int sys_off_notify(struct notifier_block *nb, + unsigned long mode, void *cmd) +{ + struct sys_off_handler *handler; + struct sys_off_data data = {}; + + handler = container_of(nb, struct sys_off_handler, nb); + data.cb_data = handler->cb_data; + data.mode = mode; + data.cmd = cmd; + + return handler->sys_off_cb(&data); +} + +static struct sys_off_handler platform_sys_off_handler; + +static struct sys_off_handler *alloc_sys_off_handler(int priority) +{ + struct sys_off_handler *handler; + gfp_t flags; + + /* + * Platforms like m68k can't allocate sys_off handler dynamically + * at the early boot time because memory allocator isn't available yet. + */ + if (priority == SYS_OFF_PRIO_PLATFORM) { + handler = &platform_sys_off_handler; + if (handler->cb_data) + return ERR_PTR(-EBUSY); + } else { + if (system_state > SYSTEM_RUNNING) + flags = GFP_ATOMIC; + else + flags = GFP_KERNEL; + + handler = kzalloc(sizeof(*handler), flags); + if (!handler) + return ERR_PTR(-ENOMEM); + } + + return handler; +} + +static void free_sys_off_handler(struct sys_off_handler *handler) +{ + if (handler == &platform_sys_off_handler) + memset(handler, 0, sizeof(*handler)); + else + kfree(handler); +} + +/** + * register_sys_off_handler - Register sys-off handler + * @mode: Sys-off mode + * @priority: Handler priority + * @callback: Callback function + * @cb_data: Callback argument + * + * Registers system power-off or restart handler that will be invoked + * at the step corresponding to the given sys-off mode. Handler's callback + * should return NOTIFY_DONE to permit execution of the next handler in + * the call chain or NOTIFY_STOP to break the chain (in error case for + * example). + * + * Multiple handlers can be registered at the default priority level. + * + * Only one handler can be registered at the non-default priority level, + * otherwise ERR_PTR(-EBUSY) is returned. + * + * Returns a new instance of struct sys_off_handler on success, or + * an ERR_PTR()-encoded error code otherwise. + */ +struct sys_off_handler * +register_sys_off_handler(enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + struct sys_off_handler *handler; + int err; + + handler = alloc_sys_off_handler(priority); + if (IS_ERR(handler)) + return handler; + + switch (mode) { + case SYS_OFF_MODE_POWER_OFF_PREPARE: + handler->list = &power_off_prep_handler_list; + handler->blocking = true; + break; + + case SYS_OFF_MODE_POWER_OFF: + handler->list = &power_off_handler_list; + break; + + case SYS_OFF_MODE_RESTART: + handler->list = &restart_handler_list; + break; + + default: + free_sys_off_handler(handler); + return ERR_PTR(-EINVAL); + } + + handler->nb.notifier_call = sys_off_notify; + handler->nb.priority = priority; + handler->sys_off_cb = callback; + handler->cb_data = cb_data; + handler->mode = mode; + + if (handler->blocking) { + if (priority == SYS_OFF_PRIO_DEFAULT) + err = blocking_notifier_chain_register(handler->list, + &handler->nb); + else + err = blocking_notifier_chain_register_unique_prio(handler->list, + &handler->nb); + } else { + if (priority == SYS_OFF_PRIO_DEFAULT) + err = atomic_notifier_chain_register(handler->list, + &handler->nb); + else + err = atomic_notifier_chain_register_unique_prio(handler->list, + &handler->nb); + } + + if (err) { + free_sys_off_handler(handler); + return ERR_PTR(err); + } + + return handler; +} +EXPORT_SYMBOL_GPL(register_sys_off_handler); + +/** + * unregister_sys_off_handler - Unregister sys-off handler + * @handler: Sys-off handler + * + * Unregisters given sys-off handler. + */ +void unregister_sys_off_handler(struct sys_off_handler *handler) +{ + int err; + + if (IS_ERR_OR_NULL(handler)) + return; + + if (handler->blocking) + err = blocking_notifier_chain_unregister(handler->list, + &handler->nb); + else + err = atomic_notifier_chain_unregister(handler->list, + &handler->nb); + + /* sanity check, shall never happen */ + WARN_ON(err); + + free_sys_off_handler(handler); +} +EXPORT_SYMBOL_GPL(unregister_sys_off_handler); + +static void devm_unregister_sys_off_handler(void *data) +{ + struct sys_off_handler *handler = data; + + unregister_sys_off_handler(handler); +} + +/** + * devm_register_sys_off_handler - Register sys-off handler + * @dev: Device that registers handler + * @mode: Sys-off mode + * @priority: Handler priority + * @callback: Callback function + * @cb_data: Callback argument + * + * Registers resource-managed sys-off handler. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_sys_off_handler(struct device *dev, + enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + struct sys_off_handler *handler; + + handler = register_sys_off_handler(mode, priority, callback, cb_data); + if (IS_ERR(handler)) + return PTR_ERR(handler); + + return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler, + handler); +} +EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); + +/** + * devm_register_power_off_handler - Register power-off handler + * @dev: Device that registers callback + * @callback: Callback function + * @cb_data: Callback's argument + * + * Registers resource-managed sys-off handler with a default priority + * and using power-off mode. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_power_off_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + return devm_register_sys_off_handler(dev, + SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + callback, cb_data); +} +EXPORT_SYMBOL_GPL(devm_register_power_off_handler); + +/** + * devm_register_restart_handler - Register restart handler + * @dev: Device that registers callback + * @callback: Callback function + * @cb_data: Callback's argument + * + * Registers resource-managed sys-off handler with a default priority + * and using restart mode. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_restart_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + return devm_register_sys_off_handler(dev, + SYS_OFF_MODE_RESTART, + SYS_OFF_PRIO_DEFAULT, + callback, cb_data); +} +EXPORT_SYMBOL_GPL(devm_register_restart_handler); + +static struct sys_off_handler *platform_power_off_handler; + +static int platform_power_off_notify(struct sys_off_data *data) +{ + void (*platform_power_power_off_cb)(void) = data->cb_data; + + platform_power_power_off_cb(); + + return NOTIFY_DONE; +} + +/** + * register_platform_power_off - Register platform-level power-off callback + * @power_off: Power-off callback + * + * Registers power-off callback that will be called as last step + * of the power-off sequence. This callback is expected to be invoked + * for the last resort. Only one platform power-off callback is allowed + * to be registered at a time. + * + * Returns zero on success, or error code on failure. + */ +int register_platform_power_off(void (*power_off)(void)) +{ + struct sys_off_handler *handler; + + handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_PLATFORM, + platform_power_off_notify, + power_off); + if (IS_ERR(handler)) + return PTR_ERR(handler); + + platform_power_off_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(register_platform_power_off); + +/** + * unregister_platform_power_off - Unregister platform-level power-off callback + * @power_off: Power-off callback + * + * Unregisters previously registered platform power-off callback. + */ +void unregister_platform_power_off(void (*power_off)(void)) +{ + if (platform_power_off_handler && + platform_power_off_handler->cb_data == power_off) { + unregister_sys_off_handler(platform_power_off_handler); + platform_power_off_handler = NULL; + } +} +EXPORT_SYMBOL_GPL(unregister_platform_power_off); + +static int legacy_pm_power_off(struct sys_off_data *data) +{ + if (pm_power_off) + pm_power_off(); + + return NOTIFY_DONE; +} + +static void do_kernel_power_off_prepare(void) +{ + blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL); +} + +/** + * do_kernel_power_off - Execute kernel power-off handler call chain + * + * Expected to be called as last step of the power-off sequence. + * + * Powers off the system immediately if a power-off handler function has + * been registered. Otherwise does nothing. + */ +void do_kernel_power_off(void) +{ + struct sys_off_handler *sys_off = NULL; + + /* + * Register sys-off handlers for legacy PM callback. This allows + * legacy PM callbacks temporary co-exist with the new sys-off API. + * + * TODO: Remove legacy handlers once all legacy PM users will be + * switched to the sys-off based APIs. + */ + if (pm_power_off) + sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + legacy_pm_power_off, NULL); + + atomic_notifier_call_chain(&power_off_handler_list, 0, NULL); + + unregister_sys_off_handler(sys_off); +} + +/** + * kernel_can_power_off - check whether system can be powered off + * + * Returns true if power-off handler is registered and system can be + * powered off, false otherwise. + */ +bool kernel_can_power_off(void) +{ + return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) || + pm_power_off; +} +EXPORT_SYMBOL_GPL(kernel_can_power_off); + /** * kernel_power_off - power_off the system * @@ -289,8 +661,7 @@ EXPORT_SYMBOL_GPL(kernel_halt); void kernel_power_off(void) { kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); + do_kernel_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); @@ -340,7 +711,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) cmd = LINUX_REBOOT_CMD_HALT; mutex_lock(&system_transition_mutex); @@ -417,7 +788,8 @@ void ctrl_alt_del(void) kill_cad_pid(SIGINT, 1); } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +#define POWEROFF_CMD_PATH_LEN 256 +static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; static const char reboot_cmd[] = "/sbin/reboot"; static int run_cmd(const char *cmd) @@ -867,6 +1239,33 @@ static struct attribute *reboot_attrs[] = { NULL, }; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_reboot_table[] = { + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static void __init kernel_reboot_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_reboot_table); +} +#else +#define kernel_reboot_sysctls_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static const struct attribute_group reboot_attr_group = { .attrs = reboot_attrs, }; @@ -886,6 +1285,8 @@ static int __init reboot_ksysfs_init(void) return ret; } + kernel_reboot_sysctls_init(); + return 0; } late_initcall(reboot_ksysfs_init); diff --git a/kernel/relay.c b/kernel/relay.c index d1a67fbb819d..6a611e779e95 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu) mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, cpu))) + if (*per_cpu_ptr(chan->buf, cpu)) continue; buf = relay_open_buf(chan, cpu); if (!buf) { diff --git a/kernel/resource.c b/kernel/resource.c index 9c08d6e9eef2..34eaee179689 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -56,14 +56,6 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); -/* - * For memory hotplug, there is no way to free resource entries allocated - * by boot mem after the system is up. So for reusing the resource entry - * we need to remember the resource. - */ -static struct resource *bootmem_resource_free; -static DEFINE_SPINLOCK(bootmem_resource_lock); - static struct resource *next_resource(struct resource *p) { if (p->child) @@ -160,36 +152,19 @@ __initcall(ioresources_init); static void free_resource(struct resource *res) { - if (!res) - return; - - if (!PageSlab(virt_to_head_page(res))) { - spin_lock(&bootmem_resource_lock); - res->sibling = bootmem_resource_free; - bootmem_resource_free = res; - spin_unlock(&bootmem_resource_lock); - } else { + /** + * If the resource was allocated using memblock early during boot + * we'll leak it here: we can only return full pages back to the + * buddy and trying to be smart and reusing them eventually in + * alloc_resource() overcomplicates resource handling. + */ + if (res && PageSlab(virt_to_head_page(res))) kfree(res); - } } static struct resource *alloc_resource(gfp_t flags) { - struct resource *res = NULL; - - spin_lock(&bootmem_resource_lock); - if (bootmem_resource_free) { - res = bootmem_resource_free; - bootmem_resource_free = res->sibling; - } - spin_unlock(&bootmem_resource_lock); - - if (res) - memset(res, 0, sizeof(struct resource)); - else - res = kzalloc(sizeof(struct resource), flags); - - return res; + return kzalloc(sizeof(struct resource), flags); } /* Return the conflict entry if you can't request it */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 6d45ac3dae7f..97ac20b4f738 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) int ret; #ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs.ptr64)) + if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else - if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { @@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs.ptr64); + return put_user(0UL, &t->rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif diff --git a/kernel/scftorture.c b/kernel/scftorture.c index dcb0410950e4..5d113aa59e77 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in) } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { - if (!(r & 0xffc0)) + if (!(r & 0xffc0)) { udelay(r & 0x3f); - goto out; + goto out; + } } if (r & 0xfff) goto out; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c83b37af155b..976092b7bd45 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) -endif # The compilers are complaining about unused variables inside an if(0) scope # block. This is daft, shut them up. @@ -25,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o - -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o -obj-$(CONFIG_SCHEDSTATS) += stats.o -obj-$(CONFIG_SCHED_DEBUG) += debug.o -obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -obj-$(CONFIG_CPU_FREQ) += cpufreq.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_CPU_ISOLATION) += isolation.o -obj-$(CONFIG_PSI) += psi.o -obj-$(CONFIG_SCHED_CORE) += core_sched.o +# +# Build efficiency: +# +# These compilation units have roughly the same size and complexity - so their +# build parallelizes well and finishes roughly at once: +# +obj-y += core.o +obj-y += fair.o +obj-y += build_policy.o +obj-y += build_utility.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 8629b37d118e..4ebaf97f7bd8 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,20 +1,42 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Auto-group scheduling implementation: */ -#include <linux/nospec.h> -#include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_autogroup_sysctls[] = { + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init sched_autogroup_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_autogroup_sysctls); +} +#else +#define sched_autogroup_sysctl_init() do { } while (0) +#endif + void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; + sched_autogroup_sysctl_init(); } void autogroup_free(struct task_group *tg) diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index b96419974a1f..90d69f2c5eaf 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_SCHED_AUTOGROUP_H +#define _KERNEL_SCHED_AUTOGROUP_H + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { + extern unsigned int sysctl_sched_autogroup_enabled; int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) @@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) } #endif /* CONFIG_SCHED_AUTOGROUP */ + +#endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c new file mode 100644 index 000000000000..d9dc9ab3773f --- /dev/null +++ b/kernel/sched/build_policy.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are the scheduling policy related scheduler files, built + * in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c and fair.c, the other two big + * compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + * + * core.c and fair.c are built separately. + */ + +/* Headers: */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/posix-timers.h> +#include <linux/sched/rt.h> + +#include <linux/cpuidle.h> +#include <linux/jiffies.h> +#include <linux/livepatch.h> +#include <linux/psi.h> +#include <linux/seqlock_api.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include <linux/tsacct_kern.h> +#include <linux/vtime.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" +#include "smp.h" + +#include "autogroup.h" +#include "stats.h" +#include "pelt.h" + +/* Source code modules: */ + +#include "idle.c" + +#include "rt.c" + +#ifdef CONFIG_SMP +# include "cpudeadline.c" +# include "pelt.c" +#endif + +#include "cputime.c" +#include "deadline.c" + diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c new file mode 100644 index 000000000000..99bdd96f454f --- /dev/null +++ b/kernel/sched/build_utility.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are various utility functions of the scheduler, + * built in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c, fair.c, smp.c and policy.c, the other + * big compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/nohz.h> +#include <linux/sched/mm.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/task_stack.h> + +#include <linux/cpufreq.h> +#include <linux/cpumask_api.h> +#include <linux/cpuset.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/energy_model.h> +#include <linux/hashtable_api.h> +#include <linux/irq.h> +#include <linux/kobject_api.h> +#include <linux/membarrier.h> +#include <linux/mempolicy.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/proc_fs.h> +#include <linux/psi.h> +#include <linux/psi.h> +#include <linux/ptrace_api.h> +#include <linux/sched_clock.h> +#include <linux/security.h> +#include <linux/spinlock_api.h> +#include <linux/swait_api.h> +#include <linux/timex.h> +#include <linux/utsname.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#include <uapi/linux/prctl.h> +#include <uapi/linux/sched/types.h> + +#include <asm/switch_to.h> + +#include "sched.h" +#include "sched-pelt.h" +#include "stats.h" +#include "autogroup.h" + +#include "clock.c" + +#ifdef CONFIG_CGROUP_CPUACCT +# include "cpuacct.c" +#endif + +#ifdef CONFIG_CPU_FREQ +# include "cpufreq.c" +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +# include "cpufreq_schedutil.c" +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include "debug.c" +#endif + +#ifdef CONFIG_SCHEDSTATS +# include "stats.c" +#endif + +#include "loadavg.c" +#include "completion.c" +#include "swait.c" +#include "wait_bit.c" +#include "wait.c" + +#ifdef CONFIG_SMP +# include "cpupri.c" +# include "stop_task.c" +# include "topology.c" +#endif + +#ifdef CONFIG_SCHED_CORE +# include "core_sched.c" +#endif + +#ifdef CONFIG_PSI +# include "psi.c" +#endif + +#ifdef CONFIG_MEMBARRIER +# include "membarrier.c" +#endif + +#ifdef CONFIG_CPU_ISOLATION +# include "isolation.c" +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +# include "autogroup.c" +#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c2b2859ddd82..e374c0c923da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,15 +53,13 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include "sched.h" -#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __weak sched_clock(void) +notrace unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -95,28 +93,28 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -static inline struct sched_clock_data *this_scd(void) +notrace static inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } -static inline struct sched_clock_data *cpu_sdc(int cpu) +notrace static inline struct sched_clock_data *cpu_sdc(int cpu) { return &per_cpu(sched_clock_data, cpu); } -int sched_clock_stable(void) +notrace int sched_clock_stable(void) { return static_branch_likely(&__sched_clock_stable); } -static void __scd_stamp(struct sched_clock_data *scd) +notrace static void __scd_stamp(struct sched_clock_data *scd) { scd->tick_gtod = ktime_get_ns(); scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +notrace static void __set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void) * The only way to fully avoid random clock jumps is to boot with: * "tsc=unstable". */ -static void __sched_clock_work(struct work_struct *work) +notrace static void __sched_clock_work(struct work_struct *work) { struct sched_clock_data *scd; int cpu; @@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work) static DECLARE_WORK(sched_clock_work, __sched_clock_work); -static void __clear_sched_clock_stable(void) +notrace static void __clear_sched_clock_stable(void) { if (!sched_clock_stable()) return; @@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void) schedule_work(&sched_clock_work); } -void clear_sched_clock_stable(void) +notrace void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; @@ -196,7 +194,7 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -static void __sched_clock_gtod_offset(void) +notrace static void __sched_clock_gtod_offset(void) { struct sched_clock_data *scd = this_scd(); @@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -static inline u64 wrap_min(u64 x, u64 y) +notrace static inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -static inline u64 wrap_max(u64 x, u64 y) +notrace static inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -262,7 +260,7 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 sched_clock_local(struct sched_clock_data *scd) +notrace static u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -289,13 +287,13 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -static u64 sched_clock_remote(struct sched_clock_data *scd) +notrace static u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; @@ -351,7 +349,7 @@ again: val = remote_clock; } - if (cmpxchg64(ptr, old_val, val) != old_val) + if (!try_cmpxchg64(ptr, &old_val, val)) goto again; return val; @@ -362,7 +360,7 @@ again: * * See cpu_clock(). */ -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; u64 clock; @@ -386,7 +384,7 @@ u64 sched_clock_cpu(int cpu) } EXPORT_SYMBOL_GPL(sched_clock_cpu); -void sched_clock_tick(void) +notrace void sched_clock_tick(void) { struct sched_clock_data *scd; @@ -403,7 +401,7 @@ void sched_clock_tick(void) sched_clock_local(scd); } -void sched_clock_tick_stable(void) +notrace void sched_clock_tick_stable(void) { if (!sched_clock_stable()) return; @@ -423,7 +421,7 @@ void sched_clock_tick_stable(void) /* * We are going deep-idle (irqs are disabled): */ -void sched_clock_idle_sleep_event(void) +notrace void sched_clock_idle_sleep_event(void) { sched_clock_cpu(smp_processor_id()); } @@ -432,7 +430,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(void) +notrace void sched_clock_idle_wakeup_event(void) { unsigned long flags; @@ -458,7 +456,7 @@ void __init sched_clock_init(void) local_irq_enable(); } -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { if (!static_branch_likely(&sched_clock_running)) return 0; @@ -476,7 +474,7 @@ u64 sched_clock_cpu(int cpu) * On bare metal this function should return the same as local_clock. * Architectures and sub-architectures can override this. */ -u64 __weak running_clock(void) +notrace u64 __weak running_clock(void) { return local_clock(); } diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a778554f9dad..35f15c26ed54 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Generic wait-for-completion handler; * @@ -11,7 +12,6 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ -#include "sched.h" /** * complete: - signals a single thread waiting on this completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9745613d531c..da0bf6fe9ecd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,26 +6,93 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> -#undef CREATE_TRACE_POINTS +#include <linux/highmem.h> +#include <linux/hrtimer_api.h> +#include <linux/ktime_api.h> +#include <linux/sched/signal.h> +#include <linux/syscalls_api.h> +#include <linux/debug_locks.h> +#include <linux/prefetch.h> +#include <linux/capability.h> +#include <linux/pgtable_api.h> +#include <linux/wait_bit.h> +#include <linux/jiffies.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/hardirq.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/init.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/nohz.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/rt.h> -#include "sched.h" - -#include <linux/nospec.h> #include <linux/blkdev.h> +#include <linux/context_tracking.h> +#include <linux/cpuset.h> +#include <linux/delayacct.h> +#include <linux/init_task.h> +#include <linux/interrupt.h> +#include <linux/ioprio.h> +#include <linux/kallsyms.h> #include <linux/kcov.h> +#include <linux/kprobes.h> +#include <linux/llist_api.h> +#include <linux/mmu_context.h> +#include <linux/mmzone.h> +#include <linux/mutex_api.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/perf_event_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/rcuwait_api.h> +#include <linux/sched/wake_q.h> #include <linux/scs.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/vtime.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#ifdef CONFIG_PREEMPT_DYNAMIC +# ifdef CONFIG_GENERIC_ENTRY +# include <linux/entry-common.h> +# endif +#endif + +#include <uapi/linux/sched/types.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#include "../workqueue_internal.h" -#include "../../fs/io-wq.h" -#include "../smpboot.h" +#define CREATE_TRACE_POINTS +#include <linux/sched/rseq_api.h> +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS +#include "sched.h" +#include "stats.h" +#include "autogroup.h" + +#include "autogroup.h" #include "pelt.h" #include "smp.h" +#include "stats.h" + +#include "../workqueue_internal.h" +#include "../../fs/io-wq.h" +#include "../smpboot.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -36,6 +103,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -80,12 +148,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = 8; const_debug unsigned int sysctl_sched_nr_migrate = 32; #endif -/* - * period over which we measure -rt task CPU usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - __read_mostly int scheduler_running; #ifdef CONFIG_SCHED_CORE @@ -380,13 +442,6 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ /* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - - -/* * Serialization rules: * * Lock order: @@ -545,10 +600,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) swap(rq1, rq2); raw_spin_rq_lock(rq1); - if (__rq_lockp(rq1) == __rq_lockp(rq2)) - return; + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(rq1, rq2); } #endif @@ -1024,13 +1079,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -1046,7 +1101,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); cpu = default_cpu; unlock: rcu_read_unlock(); @@ -1254,10 +1309,10 @@ static void set_load_weight(struct task_struct *p, bool update_load) static DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* * By default RT tasks run at the maximum performance point/capacity of the @@ -1274,7 +1329,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. */ -unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -1404,33 +1459,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p) task_rq_unlock(rq, p, &rf); } -static void uclamp_sync_util_min_rt_default(void) -{ - struct task_struct *g, *p; - - /* - * copy_process() sysctl_uclamp - * uclamp_min_rt = X; - * write_lock(&tasklist_lock) read_lock(&tasklist_lock) - * // link thread smp_mb__after_spinlock() - * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); - * sched_post_fork() for_each_process_thread() - * __uclamp_sync_rt() __uclamp_sync_rt() - * - * Ensures that either sched_post_fork() will observe the new - * uclamp_min_rt or for_each_process_thread() will observe the new - * task. - */ - read_lock(&tasklist_lock); - smp_mb__after_spinlock(); - read_unlock(&tasklist_lock); - - rcu_read_lock(); - for_each_process_thread(g, p) - uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); -} - static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -1710,6 +1738,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css) } static void cpu_util_update_eff(struct cgroup_subsys_state *css); +#endif + +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_UCLAMP_TASK +#ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) { struct task_group *tg = &root_task_group; @@ -1727,7 +1760,34 @@ static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { } #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + +static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; @@ -1791,6 +1851,8 @@ done: return result; } +#endif +#endif static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) @@ -2125,7 +2187,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - else if (p->sched_class > rq->curr->sched_class) + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); /* @@ -2343,7 +2405,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -4279,7 +4341,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4287,13 +4351,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4303,8 +4376,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif @@ -4352,7 +4427,7 @@ out: __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4374,6 +4449,52 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif /* CONFIG_UCLAMP_TASK */ + {} +}; +static int __init sched_core_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_core_sysctls); + return 0; +} +late_initcall(sched_core_sysctl_init); +#endif /* CONFIG_SYSCTL */ + /* * fork()/clone()-time setup: */ @@ -4677,25 +4798,55 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) static void balance_push(struct rq *rq); +/* + * balance_push_callback is a right abuse of the callback interface and plays + * by significantly different rules. + * + * Where the normal balance_callback's purpose is to be ran in the same context + * that queued it (only later, when it's safe to drop rq->lock again), + * balance_push_callback is specifically targeted at __schedule(). + * + * This abuse is tolerated because it places all the unlikely/odd cases behind + * a single test, namely: rq->balance_callback == NULL. + */ struct callback_head balance_push_callback = { .next = NULL, .func = (void (*)(struct callback_head *))balance_push, }; -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct callback_head * +__splice_balance_callbacks(struct rq *rq, bool split) { struct callback_head *head = rq->balance_callback; + if (likely(!head)) + return NULL; + lockdep_assert_rq_held(rq); - if (head) + /* + * Must not take balance_push_callback off the list when + * splice_balance_callbacks() and balance_callbacks() are not + * in the same rq->lock section. + * + * In that case it would be possible for __schedule() to interleave + * and observe the list empty. + */ + if (split && head == &balance_push_callback) + head = NULL; + else rq->balance_callback = NULL; return head; } +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + return __splice_balance_callbacks(rq, true); +} + static void __balance_callbacks(struct rq *rq) { - do_balance_callbacks(rq, splice_balance_callbacks(rq)); + do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } static inline void balance_callbacks(struct rq *rq, struct callback_head *head) @@ -4834,7 +4985,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - long prev_state; + unsigned int prev_state; /* * The previous task will have left us with a preempt_count of 2 @@ -5379,7 +5530,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5400,7 +5551,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5611,7 +5762,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely(prev->sched_class <= &fair_sched_class && + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -5674,6 +5825,8 @@ static inline struct task_struct *pick_task(struct rq *rq) extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); +static void queue_core_balance(struct rq *rq); + static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -5723,7 +5876,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } rq->core_pick = NULL; - return next; + goto out; } put_prev_task_balance(rq, prev, rf); @@ -5773,7 +5926,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ WARN_ON_ONCE(fi_before); task_vruntime_update(rq, next, false); - goto done; + goto out_set_next; } } @@ -5892,8 +6045,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) resched_curr(rq_i); } -done: +out_set_next: set_next_task(rq, next); +out: + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + return next; } @@ -5922,7 +6079,7 @@ static bool try_steal_cookie(int this, int that) if (p == src->core_pick || p == src->curr) goto next; - if (!cpumask_test_cpu(this, &p->cpus_mask)) + if (!is_cpu_allowed(p, this)) goto next; if (p->core_occupation > dst->idle->core_occupation) @@ -5988,7 +6145,7 @@ static void sched_core_balance(struct rq *rq) static DEFINE_PER_CPU(struct callback_head, core_balance_head); -void queue_core_balance(struct rq *rq) +static void queue_core_balance(struct rq *rq) { if (!sched_core_enabled(rq)) return; @@ -6226,10 +6383,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* * We must load prev->state once (task_struct::state is volatile), such - * that: - * - * - we form a control dependency vs deactivate_task() below. - * - ptrace_{,un}freeze_traced() can change ->state underneath us. + * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { @@ -6298,7 +6452,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6353,8 +6507,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (blk_needs_flush_plug(tsk)) - blk_flush_plug(tsk->plug, true); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -6491,17 +6644,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_dynamic_enabled +#define preempt_schedule_dynamic_enabled preempt_schedule +#define preempt_schedule_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); +void __sched notrace dynamic_preempt_schedule(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) + return; + preempt_schedule(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule); +EXPORT_SYMBOL(dynamic_preempt_schedule); +#endif #endif - /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -6556,147 +6723,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); -EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_notrace_dynamic_enabled +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +#define preempt_schedule_notrace_dynamic_disabled NULL #endif - -#endif /* CONFIG_PREEMPTION */ - -#ifdef CONFIG_PREEMPT_DYNAMIC - -#include <linux/entry-common.h> - -/* - * SC:cond_resched - * SC:might_resched - * SC:preempt_schedule - * SC:preempt_schedule_notrace - * SC:irqentry_exit_cond_resched - * - * - * NONE: - * cond_resched <- __cond_resched - * might_resched <- RET0 - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * VOLUNTARY: - * cond_resched <- __cond_resched - * might_resched <- __cond_resched - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * FULL: - * cond_resched <- RET0 - * might_resched <- RET0 - * preempt_schedule <- preempt_schedule - * preempt_schedule_notrace <- preempt_schedule_notrace - * irqentry_exit_cond_resched <- irqentry_exit_cond_resched - */ - -enum { - preempt_dynamic_undefined = -1, - preempt_dynamic_none, - preempt_dynamic_voluntary, - preempt_dynamic_full, -}; - -int preempt_dynamic_mode = preempt_dynamic_undefined; - -int sched_dynamic_mode(const char *str) -{ - if (!strcmp(str, "none")) - return preempt_dynamic_none; - - if (!strcmp(str, "voluntary")) - return preempt_dynamic_voluntary; - - if (!strcmp(str, "full")) - return preempt_dynamic_full; - - return -EINVAL; -} - -void sched_dynamic_update(int mode) -{ - /* - * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in - * the ZERO state, which is invalid. - */ - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - - switch (mode) { - case preempt_dynamic_none: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: none\n"); - break; - - case preempt_dynamic_voluntary: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: voluntary\n"); - break; - - case preempt_dynamic_full: - static_call_update(cond_resched, (void *)&__static_call_return0); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: full\n"); - break; - } - - preempt_dynamic_mode = mode; -} - -static int __init setup_preempt_mode(char *str) -{ - int mode = sched_dynamic_mode(str); - if (mode < 0) { - pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); - return 0; - } - - sched_dynamic_update(mode); - return 1; -} -__setup("preempt=", setup_preempt_mode); - -static void __init preempt_dynamic_init(void) +DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); +void __sched notrace dynamic_preempt_schedule_notrace(void) { - if (preempt_dynamic_mode == preempt_dynamic_undefined) { - if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { - sched_dynamic_update(preempt_dynamic_none); - } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { - sched_dynamic_update(preempt_dynamic_voluntary); - } else { - /* Default static call setting, nothing to do */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); - preempt_dynamic_mode = preempt_dynamic_full; - pr_info("Dynamic Preempt: full\n"); - } - } + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) + return; + preempt_schedule_notrace(); } +NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); +EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); +#endif +#endif -#else /* !CONFIG_PREEMPT_DYNAMIC */ - -static inline void preempt_dynamic_init(void) { } - -#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ +#endif /* CONFIG_PREEMPTION */ /* * This is the entry point to schedule() from kernel preemption @@ -8203,11 +8250,35 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define cond_resched_dynamic_enabled __cond_resched +#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); +#define might_resched_dynamic_enabled __cond_resched +#define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); +int __sched dynamic_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_cond_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_cond_resched); + +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); +int __sched dynamic_might_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_might_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_might_resched); +#endif #endif /* @@ -8272,6 +8343,166 @@ int __cond_resched_rwlock_write(rwlock_t *lock) } EXPORT_SYMBOL(__cond_resched_rwlock_write); +#ifdef CONFIG_PREEMPT_DYNAMIC + +#ifdef CONFIG_GENERIC_ENTRY +#include <linux/entry-common.h> +#endif + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_undefined = -1, + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +int preempt_dynamic_mode = preempt_dynamic_undefined; + +int sched_dynamic_mode(const char *str) +{ + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + + return -EINVAL; +} + +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#else +#error "Unsupported PREEMPT_DYNAMIC mechanism" +#endif + +void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 0; + } + + sched_dynamic_update(mode); + return 1; +} +__setup("preempt=", setup_preempt_mode); + +static void __init preempt_dynamic_init(void) +{ + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } + } +} + +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + +#else /* !CONFIG_PREEMPT_DYNAMIC */ + +static inline void preempt_dynamic_init(void) { } + +#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ + /** * yield - yield the current processor to other threads. * @@ -8380,9 +8611,7 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - if (current->plug) - blk_flush_plug(current->plug, true); - + blk_flush_plug(current->plug, true); return old_iowait; } @@ -8709,7 +8938,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, { int ret = 1; - if (!cpumask_weight(cur)) + if (cpumask_empty(cur)) return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); @@ -8737,8 +8966,11 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) - ret = dl_task_can_attach(p, cs_cpus_allowed); + cs_cpus_allowed)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + + ret = dl_cpu_busy(cpu, p); + } out: return ret; @@ -9022,8 +9254,10 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - if (dl_cpu_busy(cpu)) - return -EBUSY; + int ret = dl_cpu_busy(cpu, NULL); + + if (ret) + return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; @@ -9053,6 +9287,7 @@ int sched_cpu_activate(unsigned int cpu) set_cpu_active(cpu, true); if (sched_smp_initialized) { + sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); } @@ -9131,10 +9366,12 @@ int sched_cpu_deactivate(unsigned int cpu) if (!sched_smp_initialized) return 0; + sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { balance_push_set(cpu, false); set_cpu_active(cpu, true); + sched_update_numa(cpu, true); return ret; } sched_domains_numa_masks_clear(cpu); @@ -9237,7 +9474,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { - sched_init_numa(); + sched_init_numa(NUMA_NO_NODE); /* * There's no userspace yet to cause hotplug operations; hence all the @@ -9249,7 +9486,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); @@ -9302,11 +9539,11 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class + 1 != &fair_sched_class || - &fair_sched_class + 1 != &rt_sched_class || - &rt_sched_class + 1 != &dl_sched_class); + BUG_ON(&idle_sched_class != &fair_sched_class + 1 || + &fair_sched_class != &rt_sched_class + 1 || + &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class + 1 != &stop_sched_class); + BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif wait_bit_init(); @@ -9349,7 +9586,6 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c8746a9a7ada..38a2cec21014 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -1,8 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/prctl.h> -#include "sched.h" - /* * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 3d06c5e4220d..0de9dda09949 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 + /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ -#include <asm/irq_regs.h> -#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { @@ -334,14 +333,13 @@ static struct cftype files[] = { */ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { + unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; - rcu_read_lock(); + lockdep_assert_rq_held(cpu_rq(cpu)); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(*ca->cpuusage, cputime); - - rcu_read_unlock(); + *per_cpu_ptr(ca->cpuusage, cpu) += cputime; } /* @@ -353,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { struct cpuacct *ca; - rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) __this_cpu_add(ca->cpustat->cpustat[index], val); - rcu_read_unlock(); } struct cgroup_subsys cpuacct_cgrp_subsys = { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index ceb03d76c0cc..02d970a879ed 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * kernel/sched/cpudl.c + * kernel/sched/cpudeadline.c * * Global CPU deadline management * * Author: Juri Lelli <j.lelli@sssup.it> */ -#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 7c2fe50fd76d..5252fb191fae 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,9 +5,6 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#include <linux/cpufreq.h> - -#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 26778884d9ab..3dbf351d12d5 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -6,13 +6,6 @@ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include "sched.h" - -#include <linux/sched/cpufreq.h> -#include <trace/events/power.h> - #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) struct sugov_tunables { @@ -289,6 +282,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * into the same scale so we can compare. */ boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; } @@ -348,8 +342,11 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. */ - if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -395,8 +392,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, /* * Do not reduce the target performance level if the CPU has not been * idle recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. */ - if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), @@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov); static void sugov_tunables_free(struct kobject *kobj) { - struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); kfree(to_sugov_tunables(attr_set)); } diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d583f2aa744e..fa9ce9d83683 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,7 +22,6 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ -#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b7ec42732b28..78a233d43757 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,7 +2,6 @@ /* * Simple CPU accounting cgroup controller */ -#include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d2c072b0ef01..b5152961b743 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,10 +15,40 @@ * Michael Trimarchi <michael@amarulasolutions.com>, * Fabio Checconi <fchecconi@gmail.com> */ -#include "sched.h" -#include "pelt.h" -struct dl_bandwidth def_dl_bandwidth; +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { @@ -130,6 +160,21 @@ static inline bool dl_bw_visited(int cpu, u64 gen) rd->visit_gen = gen; return false; } + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -150,9 +195,38 @@ static inline bool dl_bw_visited(int cpu, u64 gen) { return false; } + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} #endif static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline bool +__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +static inline void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -408,7 +482,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->root.rb_leftmost == &dl_se->rb_node; + return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); @@ -423,12 +497,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); - raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); - raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -683,15 +755,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } -static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_dl_task(struct rq *rq) -{ -} - static inline void deadline_queue_push_tasks(struct rq *rq) { } @@ -1191,8 +1254,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is @@ -1393,6 +1454,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) timer->function = inactive_task_timer; } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -1422,10 +1486,9 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) cpudl_clear(&rq->rd->cpudl, rq->cpu); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { - struct rb_node *leftmost = dl_rq->root.rb_leftmost; - struct sched_dl_entity *entry; + struct rb_node *leftmost = rb_first_cached(&dl_rq->root); + struct sched_dl_entity *entry = __node_2_dle(leftmost); - entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } @@ -1466,9 +1529,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } -#define __node_2_dle(node) \ - rb_entry((node), struct sched_dl_entity, rb_node) - static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) { return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); @@ -1804,6 +1864,7 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; if (READ_ONCE(p->__state) != TASK_WAKING) @@ -1815,7 +1876,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); @@ -1831,7 +1892,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -1931,15 +1992,14 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); } -static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, - struct dl_rq *dl_rq) +static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) { struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; - return rb_entry(left, struct sched_dl_entity, rb_node); + return __node_2_dle(left); } static struct task_struct *pick_task_dl(struct rq *rq) @@ -1951,7 +2011,7 @@ static struct task_struct *pick_task_dl(struct rq *rq) if (!sched_dl_runnable(rq)) return NULL; - dl_se = pick_next_dl_entity(rq, dl_rq); + dl_se = pick_next_dl_entity(dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); @@ -2034,15 +2094,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + next_node: if (next_node) { - p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(next_node); if (pick_dl_task(rq, p, cpu)) return p; @@ -2208,8 +2270,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, - struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); @@ -2240,12 +2301,6 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (is_migration_disabled(next_task)) - return 0; - - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule @@ -2258,6 +2313,12 @@ retry: return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -2291,13 +2352,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + activate_task(later_rq, next_task, 0); ret = 1; resched_curr(later_rq); @@ -2731,9 +2786,6 @@ void sched_dl_do_global(void) int cpu; unsigned long flags; - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); @@ -2855,14 +2907,6 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) } /* - * Default limits for DL period; on the top end we guard against small util - * tasks still getting ridiculously long effective runtimes, on the bottom end we - * guard against timer DoS. - */ -unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ -unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ - -/* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or @@ -2955,41 +2999,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) } #ifdef CONFIG_SMP -int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) -{ - unsigned long flags, cap; - unsigned int dest_cpu; - struct dl_bw *dl_b; - bool overflow; - int ret; - - dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cap = dl_bw_capacity(dest_cpu); - overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); - if (overflow) { - ret = -EBUSY; - } else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - int cpus = dl_bw_cpus(dest_cpu); - - __dl_add(dl_b, p->dl.dl_bw, cpus); - ret = 0; - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - return ret; -} - int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3011,7 +3020,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -bool dl_cpu_busy(unsigned int cpu) +int dl_cpu_busy(int cpu, struct task_struct *p) { unsigned long flags, cap; struct dl_bw *dl_b; @@ -3021,11 +3030,22 @@ bool dl_cpu_busy(unsigned int cpu) dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cap = dl_bw_capacity(cpu); - overflow = __dl_overflow(dl_b, cap, 0, 0); + overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + + if (!overflow && p) { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - return overflow; + return overflow ? -EBUSY : 0; } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index aa29211de1bf..bb3d63bdf4ae 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,7 +6,6 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ -#include "sched.h" /* * This allows printing both to /proc/sched_debug and @@ -931,25 +930,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - if (p->mm) P(mm->numa_scan_seq); - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - P(numa_pages_migrated); P(numa_preferred_nid); P(total_numa_faults); SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); - mpol_put(pol); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5146163bfabb..77b2048a9326 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,40 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include <linux/energy_model.h> +#include <linux/mmap_lock.h> +#include <linux/hugetlb_inline.h> +#include <linux/jiffies.h> +#include <linux/mm_api.h> +#include <linux/highmem.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/isolation.h> +#include <linux/sched/nohz.h> + +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/mutex_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/ratelimit.h> +#include <linux/task_work.h> + +#include <asm/switch_to.h> + +#include <linux/sched/cond_resched.h> + #include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -141,7 +174,37 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@ -281,19 +344,6 @@ const struct sched_class fair_sched_class; #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -461,12 +511,6 @@ static int se_is_idle(struct sched_entity *se) #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@ -1259,10 +1303,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1271,6 +1315,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, * which should be ok given the number of nodes rarely exceeds 8. @@ -1283,7 +1329,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1293,8 +1339,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. */ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1312,8 +1357,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; @@ -1489,6 +1534,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1503,7 +1549,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1884,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -1949,8 +1995,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -1985,7 +2033,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -2083,13 +2131,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -2243,7 +2291,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2262,7 +2310,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2401,6 +2449,21 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { + int near_nid = max_nid; + int distance, near_distance = INT_MAX; + + for_each_node_state(nid, N_CPU) { + distance = node_distance(max_nid, nid); + if (distance < near_distance) { + near_nid = nid; + near_distance = distance; + } + } + max_nid = near_nid; + } + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); @@ -2825,6 +2888,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -2862,7 +2927,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -3776,11 +3841,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@ -4793,11 +4858,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } @@ -4811,7 +4876,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -5255,7 +5320,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -6491,108 +6556,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } /* - * cpu_util_without: compute cpu utilization without any contributions from *p - * @cpu: the CPU which utilization is requested - * @p: the task which utilization should be discounted - * - * The utilization of a CPU is defined by the utilization of tasks currently - * enqueued on that CPU as well as tasks which are currently sleeping after an - * execution on that CPU. - * - * This method returns the utilization of the specified CPU by discounting the - * utilization of the specified task, whenever the task is currently - * contributing to the CPU utilization. - */ -static unsigned long cpu_util_without(int cpu, struct task_struct *p) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); - - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); - - util = max(util, estimated); - } - - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * cpu_util. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - -/* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. */ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. */ if (task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); @@ -6600,16 +6576,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) util += task_util(p); if (sched_feat(UTIL_EST)) { + unsigned long util_est; + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util after the task has been enqueued. + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. */ if (dst_cpu == cpu) util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); } @@ -6618,6 +6618,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) } /* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. + */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util_cfs(cpu); + + return cpu_util_next(cpu, p, -1); +} + +/* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization * landscape of @pd's CPUs after the task migration, and uses the Energy Model @@ -9040,9 +9062,9 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) { - return (dst_running < (dst_weight >> 2)); + return running <= imb_numa_nr; } /* @@ -9176,12 +9198,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -9273,9 +9296,9 @@ next_group: #define NUMA_IMBALANCE_MIN 2 static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) + int dst_running, int imb_numa_nr) { - if (!allow_numa_imbalance(dst_running, dst_weight)) + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) return imbalance; /* @@ -9387,7 +9410,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, env->sd->imb_numa_nr); } return; @@ -9406,8 +9429,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. @@ -9416,6 +9437,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* @@ -9441,7 +9465,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@ -10351,7 +10375,7 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set * anywhere yet. */ @@ -10360,7 +10384,7 @@ static inline int find_new_ilb(void) int ilb; const struct cpumask *hk_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { @@ -10376,7 +10400,7 @@ static inline int find_new_ilb(void) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -10589,7 +10613,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) return; /* @@ -10805,7 +10829,7 @@ static void nohz_newidle_balance(struct rq *this_rq) * This CPU doesn't want to be disturbed by scheduler * housekeeping */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) return; /* Will wake up very soon. No time for doing anything else*/ @@ -11827,101 +11851,3 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); - -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? rq->nr_running : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d17b0a5ce6ac..328cccbee444 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,9 +6,6 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ -#include "sched.h" - -#include <trace/events/power.h> /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -105,7 +102,7 @@ void __cpuidle default_idle_call(void) * last -- this is very similar to the entry code. */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(_THIS_IP_); + lockdep_hardirqs_on_prepare(); rcu_idle_enter(); lockdep_hardirqs_on(_THIS_IP_); @@ -330,7 +327,7 @@ static void do_idle(void) * RCU relies on this call to be done outside of an RCU read-side * critical section. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); schedule_idle(); if (unlikely(klp_patch_pending(current))) @@ -437,7 +434,6 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); - queue_core_balance(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 7f06eaf12818..373d42c707bc 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,136 +7,179 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include "sched.h" + +enum hk_flags { + HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), + HK_FLAG_RCU = BIT(HK_TYPE_RCU), + HK_FLAG_MISC = BIT(HK_TYPE_MISC), + HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), + HK_FLAG_TICK = BIT(HK_TYPE_TICK), + HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), + HK_FLAG_WQ = BIT(HK_TYPE_WQ), + HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), + HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), +}; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); -static cpumask_var_t housekeeping_mask; -static unsigned int housekeeping_flags; -bool housekeeping_enabled(enum hk_flags flags) +struct housekeeping { + cpumask_var_t cpumasks[HK_TYPE_MAX]; + unsigned long flags; +}; + +static struct housekeeping housekeeping; + +bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping_flags & flags); + return !!(housekeeping.flags & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); -int housekeeping_any_cpu(enum hk_flags flags) +int housekeeping_any_cpu(enum hk_type type) { int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping_flags & flags) { - cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (housekeeping.flags & BIT(type)) { + cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; - return cpumask_any_and(housekeeping_mask, cpu_online_mask); + return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); } } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +const struct cpumask *housekeeping_cpumask(enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return housekeeping_mask; + if (housekeeping.flags & BIT(type)) + return housekeeping.cpumasks[type]; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t, enum hk_flags flags) +void housekeeping_affine(struct task_struct *t, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu, enum hk_flags flags) +bool housekeeping_test_cpu(int cpu, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return cpumask_test_cpu(cpu, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!housekeeping_flags) + enum hk_type type; + + if (!housekeeping.flags) return; static_branch_enable(&housekeeping_overridden); - if (housekeeping_flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_TICK) sched_tick_offload_init(); - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type])); + } } -static int __init housekeeping_setup(char *str, enum hk_flags flags) +static void __init housekeeping_setup_type(enum hk_type type, + cpumask_var_t housekeeping_staging) { - cpumask_var_t non_housekeeping_mask; - cpumask_var_t tmp; + + alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]); + cpumask_copy(housekeeping.cpumasks[type], + housekeeping_staging); +} + +static int __init housekeeping_setup(char *str, unsigned long flags) +{ + cpumask_var_t non_housekeeping_mask, housekeeping_staging; + int err = 0; + + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { + pr_warn("Housekeeping: nohz unsupported." + " Build with CONFIG_NO_HZ_FULL\n"); + return 0; + } + } alloc_bootmem_cpumask_var(&non_housekeeping_mask); if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_non_housekeeping_mask; } - alloc_bootmem_cpumask_var(&tmp); - if (!housekeeping_flags) { - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, non_housekeeping_mask); + alloc_bootmem_cpumask_var(&housekeeping_staging); + cpumask_andnot(housekeeping_staging, + cpu_possible_mask, non_housekeeping_mask); - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) { + if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) { + __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + if (!housekeeping.flags) { pr_warn("Housekeeping: must include one present CPU, " "using boot CPU:%d\n", smp_processor_id()); - __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - } - } else { - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); - if (!cpumask_equal(tmp, housekeeping_mask)) { - pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - free_bootmem_cpumask_var(tmp); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; } } - free_bootmem_cpumask_var(tmp); - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { - tick_nohz_full_setup(non_housekeeping_mask); - } else { - pr_warn("Housekeeping: nohz unsupported." - " Build with CONFIG_NO_HZ_FULL\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + if (!housekeeping.flags) { + /* First setup call ("nohz_full=" or "isolcpus=") */ + enum hk_type type; + + for_each_set_bit(type, &flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); + } else { + /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */ + enum hk_type type; + unsigned long iter_flags = flags & housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) { + if (!cpumask_equal(housekeeping_staging, + housekeeping.cpumasks[type])) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + goto free_housekeeping_staging; + } } + + iter_flags = flags & ~housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); } - housekeeping_flags |= flags; + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + tick_nohz_full_setup(non_housekeeping_mask); + + housekeeping.flags |= flags; + err = 1; +free_housekeeping_staging: + free_bootmem_cpumask_var(housekeeping_staging); +free_non_housekeeping_mask: free_bootmem_cpumask_var(non_housekeeping_mask); - return 1; + return err; } static int __init housekeeping_nohz_full_setup(char *str) { - unsigned int flags; + unsigned long flags; flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD; @@ -147,7 +190,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - unsigned int flags = 0; + unsigned long flags = 0; bool illegal = false; char *par; int len; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 954b229868d9..52c8f8226b0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,7 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ -#include "sched.h" /* * Global load-average calculations diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3d2825408e3a..0c5be7ebb1dc 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,7 +4,6 @@ * * membarrier system call */ -#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a554e3bbab2b..0f310768260c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -24,10 +24,6 @@ * Author: Vincent Guittot <vincent.guittot@linaro.org> */ -#include <linux/sched.h> -#include "sched.h" -#include "pelt.h" - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index c336f5f481bc..4ff2ed4f8fa1 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq) static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e14358178849..a337f3e35997 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -137,21 +137,6 @@ * sampling of the aggregate task states would be. */ -#include "../workqueue_internal.h" -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/uaccess.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/ctype.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/psi.h> -#include "sched.h" - static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -523,7 +508,7 @@ static void init_triggers(struct psi_group *group, u64 now) static u64 update_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - bool new_stall = false; + bool update_total = false; u64 *total = group->total[PSI_POLL]; /* @@ -532,24 +517,35 @@ static u64 update_triggers(struct psi_group *group, u64 now) */ list_for_each_entry(t, &group->triggers, node) { u64 growth; + bool new_stall; - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; + new_stall = group->polling_total[t->state] != total[t->state]; + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - + if (new_stall) { + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + update_total = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + t->pending_event = true; + } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) continue; @@ -558,9 +554,11 @@ static u64 update_triggers(struct psi_group *group, u64 now) if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; } - if (new_stall) + if (update_total) memcpy(group->polling_total, total, sizeof(group->polling_total)); @@ -1062,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) mutex_unlock(&group->avgs_lock); for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -1119,11 +1120,13 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); + t->pending_event = false; mutex_lock(&group->trigger_lock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7b4f4fbbb404..8c9ed9664840 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,12 +3,8 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ -#include "sched.h" - -#include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; -int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -16,6 +12,57 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +#ifdef CONFIG_SYSCTL +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, + {} +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -271,8 +318,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP -static void pull_rt_task(struct rq *this_rq); - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ @@ -429,15 +474,6 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } -static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_rt_task(struct rq *this_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } @@ -885,6 +921,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; int skip; /* @@ -899,7 +936,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -937,7 +974,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -1730,8 +1767,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f rt_queue_push_tasks(rq); } -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) +static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) { struct rt_prio_array *array = &rt_rq->active; struct sched_rt_entity *next = NULL; @@ -1753,7 +1789,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; do { - rt_se = pick_next_rt_entity(rq, rt_rq); + rt_se = pick_next_rt_entity(rt_rq); BUG_ON(!rt_se); rt_rq = group_rt_rq(rt_se); } while (rt_rq); @@ -2026,6 +2062,16 @@ static int push_rt_task(struct rq *rq, bool pull) return 0; retry: + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_curr(rq); + return 0; + } + if (is_migration_disabled(next_task)) { struct task_struct *push_task = NULL; int cpu; @@ -2033,6 +2079,18 @@ retry: if (!pull || rq->push_busy) return 0; + /* + * Invoking find_lowest_rq() on anything but an RT task doesn't + * make sense. Per the above priority check, curr has to + * be of higher priority than next_task, so no need to + * reschedule when bailing out. + * + * Note that the stoppers are masqueraded as SCHED_FIFO + * (cf. sched_set_stop_task()), so we can't rely on rt_task(). + */ + if (rq->curr->sched_class != &rt_sched_class) + return 0; + cpu = find_lowest_rq(rq->curr); if (cpu == -1 || cpu == rq->cpu) return 0; @@ -2057,16 +2115,6 @@ retry: if (WARN_ON(next_task == rq->curr)) return 0; - /* - * It's possible that the next_task slipped in of - * higher priority than current. If that's the case - * just reschedule current. - */ - if (unlikely(next_task->prio < rq->curr->prio)) { - resched_curr(rq); - return 0; - } - /* We might release rq lock */ get_task_struct(next_task); @@ -2864,6 +2912,7 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { int ret = 0; @@ -2874,6 +2923,7 @@ static int sched_rt_global_constraints(void) return ret; } +#endif /* CONFIG_SYSCTL */ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { @@ -2885,6 +2935,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) } #else /* !CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { unsigned long flags; @@ -2902,8 +2954,10 @@ static int sched_rt_global_constraints(void) return 0; } +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { if (sysctl_sched_rt_period <= 0) @@ -2928,7 +2982,7 @@ static void sched_rt_do_global(void) raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_period, old_runtime; @@ -2967,7 +3021,7 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2988,6 +3042,7 @@ int sched_rr_handler(struct ctl_table *table, int write, void *buffer, return ret; } +#endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index de53be905739..47b89a0fc6e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,86 +2,98 @@ /* * Scheduler internal types and methods: */ -#include <linux/sched.h> +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H +#include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> -#include <linux/sched/clock.h> -#include <linux/sched/coredump.h> #include <linux/sched/cpufreq.h> -#include <linux/sched/cputime.h> #include <linux/sched/deadline.h> -#include <linux/sched/debug.h> -#include <linux/sched/hotplug.h> -#include <linux/sched/idle.h> -#include <linux/sched/init.h> -#include <linux/sched/isolation.h> -#include <linux/sched/jobctl.h> +#include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/mm.h> -#include <linux/sched/nohz.h> -#include <linux/sched/numa_balancing.h> -#include <linux/sched/prio.h> -#include <linux/sched/rt.h> +#include <linux/sched/rseq_api.h> #include <linux/sched/signal.h> #include <linux/sched/smt.h> #include <linux/sched/stat.h> #include <linux/sched/sysctl.h> +#include <linux/sched/task_flags.h> #include <linux/sched/task.h> -#include <linux/sched/task_stack.h> #include <linux/sched/topology.h> -#include <linux/sched/user.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/xacct.h> -#include <uapi/linux/sched/types.h> - -#include <linux/binfmts.h> -#include <linux/bitops.h> -#include <linux/compat.h> -#include <linux/context_tracking.h> +#include <linux/atomic.h> +#include <linux/bitmap.h> +#include <linux/bug.h> +#include <linux/capability.h> +#include <linux/cgroup_api.h> +#include <linux/cgroup.h> #include <linux/cpufreq.h> -#include <linux/cpuidle.h> -#include <linux/cpuset.h> +#include <linux/cpumask_api.h> #include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/delayacct.h> -#include <linux/energy_model.h> -#include <linux/init_task.h> -#include <linux/kprobes.h> +#include <linux/file.h> +#include <linux/fs_api.h> +#include <linux/hrtimer_api.h> +#include <linux/interrupt.h> +#include <linux/irq_work.h> +#include <linux/jiffies.h> +#include <linux/kref_api.h> #include <linux/kthread.h> -#include <linux/membarrier.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> -#include <linux/nmi.h> +#include <linux/ktime_api.h> +#include <linux/lockdep_api.h> +#include <linux/lockdep.h> +#include <linux/minmax.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/mutex_api.h> +#include <linux/plist.h> +#include <linux/poll.h> #include <linux/proc_fs.h> -#include <linux/prefetch.h> #include <linux/profile.h> #include <linux/psi.h> -#include <linux/ratelimit.h> -#include <linux/rcupdate_wait.h> -#include <linux/security.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/softirq.h> +#include <linux/spinlock_api.h> +#include <linux/static_key.h> #include <linux/stop_machine.h> -#include <linux/suspend.h> -#include <linux/swait.h> +#include <linux/syscalls_api.h> #include <linux/syscalls.h> -#include <linux/task_work.h> -#include <linux/tsacct_kern.h> +#include <linux/tick.h> +#include <linux/topology.h> +#include <linux/types.h> +#include <linux/u64_stats_sync_api.h> +#include <linux/uaccess.h> +#include <linux/wait_api.h> +#include <linux/wait_bit.h> +#include <linux/workqueue_api.h> + +#include <trace/events/power.h> +#include <trace/events/sched.h> + +#include "../workqueue_internal.h" + +#ifdef CONFIG_CGROUP_SCHED +#include <linux/cgroup.h> +#include <linux/psi.h> +#endif -#include <asm/tlb.h> +#ifdef CONFIG_SCHED_DEBUG +# include <linux/static_key.h> +#endif #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> +# include <asm/paravirt_api_clock.h> #endif #include "cpupri.h" #include "cpudeadline.h" -#include <trace/events/sched.h> - #ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -96,10 +108,17 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern unsigned int sysctl_sched_child_runs_first; + extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; + /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -301,29 +320,6 @@ struct dl_bw { u64 total_bw; }; -static inline void __dl_update(struct dl_bw *dl_b, s64 bw); - -static inline -void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw -= tsk_bw; - __dl_update(dl_b, (s32)tsk_bw / cpus); -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw += tsk_bw; - __dl_update(dl_b, -((s32)tsk_bw / cpus)); -} - -static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, - u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; -} - /* * Verify the fitness of task @p to run on @cpu taking into account the * CPU original capacity and the runtime/deadline ratio of the task. @@ -347,15 +343,11 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern bool dl_cpu_busy(unsigned int cpu); +extern int dl_cpu_busy(int cpu, struct task_struct *p); #ifdef CONFIG_CGROUP_SCHED -#include <linux/cgroup.h> -#include <linux/psi.h> - struct cfs_rq; struct rt_rq; @@ -618,8 +610,8 @@ struct cfs_rq { s64 runtime_remaining; u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; @@ -1247,8 +1239,6 @@ static inline bool sched_group_cookie_match(struct rq *rq, return false; } -extern void queue_core_balance(struct rq *rq); - static inline bool sched_core_enqueued(struct task_struct *p) { return !RB_EMPTY_NODE(&p->core_node); @@ -1282,10 +1272,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -static inline void queue_core_balance(struct rq *rq) -{ -} - static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) { return true; @@ -1662,12 +1648,14 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -extern void sched_init_numa(void); +extern void sched_init_numa(int offline_node); +extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else -static inline void sched_init_numa(void) { } +static inline void sched_init_numa(int offline_node) { } +static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) @@ -1705,6 +1693,11 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_rq_held(rq); + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; @@ -1846,15 +1839,9 @@ static inline void dirty_sched_domain_sysctl(int cpu) #endif extern int sched_update_scaling(void); - -extern void flush_smp_call_function_from_idle(void); - -#else /* !CONFIG_SMP: */ -static inline void flush_smp_call_function_from_idle(void) { } -#endif +#endif /* CONFIG_SMP */ #include "stats.h" -#include "autogroup.h" #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) @@ -1950,7 +1937,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -2203,6 +2189,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) * * include/asm-generic/vmlinux.lds.h * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * * Also enforce alignment on the instance, not the type, to guarantee layout. */ #define DEFINE_SCHED_CLASS(name) \ @@ -2211,17 +2199,16 @@ const struct sched_class name##_sched_class \ __section("__" #name "_sched_class") /* Defined in include/asm-generic/vmlinux.lds.h */ -extern struct sched_class __begin_sched_classes[]; -extern struct sched_class __end_sched_classes[]; - -#define sched_class_highest (__end_sched_classes - 1) -#define sched_class_lowest (__begin_sched_classes - 1) +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class--) + for (class = (_from); class < (_to); class++) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, sched_class_lowest) + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -2330,8 +2317,8 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); @@ -2500,6 +2487,24 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ +#ifdef CONFIG_SMP + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +#endif #ifdef CONFIG_SMP @@ -2565,14 +2570,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - if (__rq_lockp(this_rq) == __rq_lockp(busiest)) - return 0; - - if (likely(raw_spin_rq_trylock(busiest))) + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); return 0; + } if (rq_order_less(this_rq, busiest)) { raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); return 0; } @@ -2666,6 +2672,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) BUG_ON(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); } /* @@ -2747,32 +2754,6 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif -#ifdef CONFIG_SMP -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); - int i; - - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) { - struct rq *rq = cpu_rq(i); - - rq->dl.extra_bw += bw; - } -} -#else -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif - - #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; @@ -2841,88 +2822,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ -#ifdef CONFIG_UCLAMP_TASK -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); - -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fast - * path is disabled, rendering this operation a NOP. - * - * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. - */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = 0; - unsigned long max_util = 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util = uclamp_eff_value(p, UCLAMP_MIN); - max_util = uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) - goto out; - } - - min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); - max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} - -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. - */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} -#else /* CONFIG_UCLAMP_TASK */ -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - -static inline bool uclamp_is_used(void) -{ - return false; -} -#endif /* CONFIG_UCLAMP_TASK */ - #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true @@ -3020,6 +2919,105 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif +#ifdef CONFIG_UCLAMP_TASK +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + unsigned long min_util = 0; + unsigned long max_util = 0; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + if (p) { + min_util = uclamp_eff_value(p, UCLAMP_MIN); + max_util = uclamp_eff_value(p, UCLAMP_MAX); + + /* + * Ignore last runnable task's max clamp, as this task will + * reset it. Similarly, no need to read the rq's min clamp. + */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + goto out; + } + + min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); + max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); +out: + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +/* Is the rq being capped/throttled by uclamp_max? */ +static inline bool uclamp_rq_is_capped(struct rq *rq) +{ + unsigned long rq_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return false; + + rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + return util; +} + +static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { @@ -3118,3 +3116,4 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 9620e323162c..2eb23dd0f285 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -7,3 +7,9 @@ extern void sched_ttwu_pending(void *arg); extern void send_call_function_single_ipi(int cpu); + +#ifdef CONFIG_SMP +extern void flush_smp_call_function_queue(void); +#else +static inline void flush_smp_call_function_queue(void) { } +#endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 07dde2928c79..857f837f52cb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,7 +2,6 @@ /* * /proc/schedstat implementation */ -#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 3a3c826dd83a..baa839c1ba96 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_STATS_H +#define _KERNEL_STATS_H #ifdef CONFIG_SCHEDSTATS @@ -298,3 +300,5 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ + +#endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 0b165a25f22f..d04073a93eb4 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,7 +7,6 @@ * * See kernel/stop_machine.c */ -#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e1c655f928c7..76b9b796e695 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,7 +2,6 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ -#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d201a7052a29..05b6c2ad90b9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,7 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); @@ -74,7 +73,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_span(group))) { + if (cpumask_empty(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; @@ -207,7 +206,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); -unsigned int sysctl_sched_energy_aware = 1; +static unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -221,7 +220,7 @@ void rebuild_sched_domains_energy(void) } #ifdef CONFIG_PROC_SYSCTL -int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -238,6 +237,27 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table sched_energy_aware_sysctls[] = { + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_energy_aware_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_energy_aware_sysctls); + return 0; +} + +late_initcall(sched_energy_aware_sysctl_init); #endif static void free_pd(struct perf_domain *pd) @@ -1366,7 +1386,7 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry(entry, &asym_cap_list, link) cpumask_clear(cpu_capacity_span(entry)); - for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN)) + for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) asym_cpu_capacity_update_data(cpu); list_for_each_entry_safe(entry, next, &asym_cap_list, link) { @@ -1492,8 +1512,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; - -static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1651,6 +1669,7 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -1661,6 +1680,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) return; sched_domain_topology = tl; + sched_domain_topology_saved = NULL; } #ifdef CONFIG_NUMA @@ -1684,8 +1704,12 @@ static void sched_numa_warn(const char *str) for (i = 0; i < nr_node_ids; i++) { printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); + for (j = 0; j < nr_node_ids; j++) { + if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) + printk(KERN_CONT "(%02d) ", node_distance(i,j)); + else + printk(KERN_CONT " %02d ", node_distance(i,j)); + } printk(KERN_CONT "\n"); } printk(KERN_WARNING "\n"); @@ -1693,19 +1717,34 @@ static void sched_numa_warn(const char *str) bool find_numa_distance(int distance) { - int i; + bool found = false; + int i, *distances; if (distance == node_distance(0, 0)) return true; + rcu_read_lock(); + distances = rcu_dereference(sched_domains_numa_distance); + if (!distances) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; + if (distances[i] == distance) { + found = true; + break; + } } +unlock: + rcu_read_unlock(); - return false; + return found; } +#define for_each_cpu_node_but(n, nbut) \ + for_each_node_state(n, N_CPU) \ + if (n == nbut) \ + continue; \ + else + /* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system @@ -1725,7 +1764,7 @@ bool find_numa_distance(int distance) * there is an intermediary node C, which is < N hops away from both * nodes A and B, the system is a glueless mesh. */ -static void init_numa_topology_type(void) +static void init_numa_topology_type(int offline_node) { int a, b, c, n; @@ -1736,14 +1775,14 @@ static void init_numa_topology_type(void) return; } - for_each_online_node(a) { - for_each_online_node(b) { + for_each_cpu_node_but(a, offline_node) { + for_each_cpu_node_but(b, offline_node) { /* Find two nodes furthest removed from each other. */ if (node_distance(a, b) < n) continue; /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { + for_each_cpu_node_but(c, offline_node) { if (node_distance(a, c) < n && node_distance(b, c) < n) { sched_numa_topology_type = @@ -1756,17 +1795,22 @@ static void init_numa_topology_type(void) return; } } + + pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); + sched_numa_topology_type = NUMA_DIRECT; } #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(void) +void sched_init_numa(int offline_node) { struct sched_domain_topology_level *tl; unsigned long *distance_map; int nr_levels = 0; int i, j; + int *distances; + struct cpumask ***masks; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the @@ -1777,12 +1821,13 @@ void sched_init_numa(void) return; bitmap_zero(distance_map, NR_DISTANCE_VALUES); - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(i, offline_node) { + for_each_cpu_node_but(j, offline_node) { int distance = node_distance(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); + bitmap_free(distance_map); return; } @@ -1795,16 +1840,17 @@ void sched_init_numa(void) */ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!sched_domains_numa_distance) { + distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!distances) { bitmap_free(distance_map); return; } for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); - sched_domains_numa_distance[i] = j; + distances[i] = j; } + rcu_assign_pointer(sched_domains_numa_distance, distances); bitmap_free(distance_map); @@ -1826,8 +1872,8 @@ void sched_init_numa(void) */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); - if (!sched_domains_numa_masks) + masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); + if (!masks) return; /* @@ -1835,31 +1881,20 @@ void sched_init_numa(void) * CPUs of nodes that are that many hops away from us. */ for (i = 0; i < nr_levels; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) + masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!masks[i]) return; - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(j, offline_node) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); int k; if (!mask) return; - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - /* - * Distance information can be unreliable for - * offline nodes, defer building the node - * masks to its bringup. - * This relies on all unique distance values - * still being visible at init time. - */ - if (!node_online(j)) - continue; + masks[i][j] = mask; + for_each_cpu_node_but(k, offline_node) { if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1870,6 +1905,7 @@ void sched_init_numa(void) } } } + rcu_assign_pointer(sched_domains_numa_masks, masks); /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); @@ -1907,59 +1943,67 @@ void sched_init_numa(void) }; } + sched_domain_topology_saved = sched_domain_topology; sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; - - init_numa_topology_type(); + WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); - sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); - if (!sched_numa_onlined_nodes) - return; - - bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); - for_each_online_node(i) - bitmap_set(sched_numa_onlined_nodes, i, 1); + init_numa_topology_type(offline_node); } -static void __sched_domains_numa_masks_set(unsigned int node) -{ - int i, j; - - /* - * NUMA masks are not built for offline nodes in sched_init_numa(). - * Thus, when a CPU of a never-onlined-before node gets plugged in, - * adding that new CPU to the right NUMA masks is not sufficient: the - * masks of that CPU's node must also be updated. - */ - if (test_bit(node, sched_numa_onlined_nodes)) - return; - bitmap_set(sched_numa_onlined_nodes, node, 1); - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (!node_online(j) || node == j) - continue; +static void sched_reset_numa(void) +{ + int nr_levels, *distances; + struct cpumask ***masks; - if (node_distance(j, node) > sched_domains_numa_distance[i]) + nr_levels = sched_domains_numa_levels; + sched_domains_numa_levels = 0; + sched_max_numa_distance = 0; + sched_numa_topology_type = NUMA_DIRECT; + distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_domains_numa_distance, NULL); + masks = sched_domains_numa_masks; + rcu_assign_pointer(sched_domains_numa_masks, NULL); + if (distances || masks) { + int i, j; + + synchronize_rcu(); + kfree(distances); + for (i = 0; i < nr_levels && masks; i++) { + if (!masks[i]) continue; - - /* Add remote nodes in our masks */ - cpumask_or(sched_domains_numa_masks[i][node], - sched_domains_numa_masks[i][node], - sched_domains_numa_masks[0][j]); + for_each_node(j) + kfree(masks[i][j]); + kfree(masks[i]); } + kfree(masks); } + if (sched_domain_topology_saved) { + kfree(sched_domain_topology); + sched_domain_topology = sched_domain_topology_saved; + sched_domain_topology_saved = NULL; + } +} + +/* + * Call with hotplug lock held + */ +void sched_update_numa(int cpu, bool online) +{ + int node; + node = cpu_to_node(cpu); /* - * A new node has been brought up, potentially changing the topology - * classification. - * - * Note that this is racy vs any use of sched_numa_topology_type :/ + * Scheduler NUMA topology is updated when the first CPU of a + * node is onlined or the last CPU of a node is offlined. */ - init_numa_topology_type(); + if (cpumask_weight(cpumask_of_node(node)) != 1) + return; + + sched_reset_numa(); + sched_init_numa(online ? NUMA_NO_NODE : node); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1967,11 +2011,9 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; - __sched_domains_numa_masks_set(node); - for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { - if (!node_online(j)) + if (!node_state(j, N_CPU)) continue; /* Set ourselves in the remote node's masks */ @@ -1986,8 +2028,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu) int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + for (j = 0; j < nr_node_ids; j++) { + if (sched_domains_numa_masks[i][j]) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } } } @@ -2001,14 +2045,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) */ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { - int i, j = cpu_to_node(cpu); + int i, j = cpu_to_node(cpu), found = nr_cpu_ids; + struct cpumask ***masks; + rcu_read_lock(); + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); - if (cpu < nr_cpu_ids) - return cpu; + if (!masks[i][j]) + break; + cpu = cpumask_any_and(cpus, masks[i][j]); + if (cpu < nr_cpu_ids) { + found = cpu; + break; + } } - return nr_cpu_ids; +unlock: + rcu_read_unlock(); + + return found; } #endif /* CONFIG_NUMA */ @@ -2242,6 +2298,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain __rcu *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 25% of the node. This is an + * arbitrary cutoff based on SMT-2 to balance + * between memory bandwidth and avoiding + * premature sharing of HT resources and SMT-4 + * or SMT-8 *may* benefit from a different + * cutoff. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 2; + else + imb = nr_llcs; + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top_p = sd->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top_p = top_p->parent; + } + imb_span = top_p ? top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) @@ -2351,7 +2458,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); return err; @@ -2440,7 +2547,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], if (doms_new) { n = 1; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } } else { n = ndoms_new; @@ -2475,7 +2582,7 @@ match1: n = 0; doms_new = &fallback_doms; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } /* Build new domains: */ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index eca38107b32f..9860bb9a847c 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,7 +4,6 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 02ce292b9bc0..d4788f810b55 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only + /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) diff --git a/kernel/scs.c b/kernel/scs.c index 579841be8864..b7e1b096d906 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,15 +32,19 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + s = kasan_unpoison_vmalloc(s, SCS_SIZE, + KASAN_VMALLOC_PROT_NORMAL); memset(s, 0, SCS_SIZE); - return s; + goto out; } } - return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_SCS, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + +out: + return kasan_reset_tag(s); } void *scs_alloc(int node) @@ -78,7 +82,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL); vfree_atomic(s); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index db10e73d06e0..e9852d1b4a5e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -42,7 +42,6 @@ #include <linux/pid.h> #include <linux/ptrace.h> #include <linux/capability.h> -#include <linux/tracehook.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> #include <linux/lockdep.h> @@ -201,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) * the filter can be freed. * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged + * @wait_killable_recv: Put notifying process in killable state once the + * notification is received by the userspace listener. * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information @@ -221,6 +222,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + bool wait_killable_recv; struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -894,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags, if (flags & SECCOMP_FILTER_FLAG_LOG) filter->log = true; + /* Set wait killable flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) + filter->wait_killable_recv = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -1081,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn complete(&addfd->completion); } +static bool should_sleep_killable(struct seccomp_filter *match, + struct seccomp_knotif *n) +{ + return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -1101,7 +1113,7 @@ static int seccomp_do_user_notification(int this_syscall, n.data = sd; n.id = seccomp_next_notify_id(match); init_completion(&n.ready); - list_add(&n.list, &match->notif->notifications); + list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); @@ -1111,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall, * This is where we wait for a reply from userspace. */ do { + bool wait_killable = should_sleep_killable(match, &n); + mutex_unlock(&match->notify_lock); - err = wait_for_completion_interruptible(&n.ready); + if (wait_killable) + err = wait_for_completion_killable(&n.ready); + else + err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); - if (err != 0) + + if (err != 0) { + /* + * Check to see if the notifcation got picked up and + * whether we should switch to wait killable. + */ + if (!wait_killable && should_sleep_killable(match, &n)) + continue; + goto interrupted; + } addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); @@ -1485,6 +1511,9 @@ out: mutex_lock(&filter->notify_lock); knotif = find_notification(filter, unotif.id); if (knotif) { + /* Reset the process to make sure it's not stuck */ + if (should_sleep_killable(filter, knotif)) + complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); } @@ -1830,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags, ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; + /* + * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense + * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag. + */ + if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) && + ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0)) + return -EINVAL; + /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) diff --git a/kernel/signal.c b/kernel/signal.c index 9b04631acde8..edb1dc9b00dc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -32,7 +32,7 @@ #include <linux/signal.h> #include <linux/signalfd.h> #include <linux/ratelimit.h> -#include <linux/tracehook.h> +#include <linux/task_work.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> @@ -762,7 +762,10 @@ still_pending: */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { + lockdep_assert_held(&t->sighand->siglock); + set_tsk_thread_flag(t, TIF_SIGPENDING); + /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it @@ -884,7 +887,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); @@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) for_each_thread(p, t) { flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - if (likely(!(t->ptrace & PT_SEIZED))) + if (likely(!(t->ptrace & PT_SEIZED))) { + t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); - else + } else ptrace_trap_notify(t); } @@ -1071,15 +1075,15 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, bool force) +static int __send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) @@ -1212,8 +1216,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) return ret; } -static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type) +int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; @@ -1245,7 +1249,7 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct force = true; } } - return __send_signal(sig, info, t, type, force); + return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1281,12 +1285,6 @@ static int __init setup_print_fatal_signals(char *str) __setup("print-fatal-signals=", setup_print_fatal_signals); -int -__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) -{ - return send_signal(sig, info, p, PIDTYPE_TGID); -} - int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { @@ -1294,7 +1292,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, type); + ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -1347,7 +1345,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = send_signal(sig, info, t, PIDTYPE_PID); + ret = send_signal_locked(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; @@ -1567,7 +1565,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); + ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1805,7 +1803,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif -int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; @@ -1817,7 +1815,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) info.si_perf_data = sig_data; info.si_perf_type = type; - return force_sig_info(&info); + /* + * Signals generated by perf events should not terminate the whole + * process if SIGTRAP is blocked, however, delivering the signal + * asynchronously is better than not delivering at all. But tell user + * space if the signal was asynchronous, so it can clearly be + * distinguished from normal synchronous ones. + */ + info.si_perf_flags = sigismember(¤t->blocked, info.si_signo) ? + TRAP_PERF_FLAG_ASYNC : + 0; + + return send_sig_info(info.si_signo, &info, current); } /** @@ -2103,7 +2112,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * parent's namespaces. */ if (valid_signal(sig) && sig) - __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false); + __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); @@ -2173,7 +2182,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - __group_send_sig_info(SIGCHLD, &info, parent); + send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ @@ -2189,10 +2198,12 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * - * If we actually decide not to stop at all because the tracer - * is gone, we keep current->exit_code unless clear_code. + * Returns the signal the ptracer requested the code resume + * with. If the code did not stop because the tracer is gone, + * the stop signal remains unchanged unless clear_code. */ -static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info) +static int ptrace_stop(int exit_code, int why, unsigned long message, + kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { @@ -2213,10 +2224,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t } /* - * schedule() will not sleep if there is a pending signal that - * can awaken the task. + * After this point ptrace_signal_wake_up or signal_wake_up + * will clear TASK_TRACED if ptrace_unlink happens or a fatal + * signal comes in. Handle previous ptrace_unlinks and fatal + * signals here to prevent ptrace_stop sleeping in schedule. */ + if (!current->ptrace || __fatal_signal_pending(current)) + return exit_code; + set_special_state(TASK_TRACED); + current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before @@ -2238,6 +2255,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t */ smp_wmb(); + current->ptrace_message = message; current->last_siginfo = info; current->exit_code = exit_code; @@ -2261,53 +2279,33 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - if (likely(current->ptrace)) { - /* - * Notify parents of the stop. - * - * While ptraced, there are two parents - the ptracer and - * the real_parent of the group_leader. The ptracer should - * know about every stop while the real parent is only - * interested in the completion of group stop. The states - * for the two don't interact with each other. Notify - * separately unless they're gonna be duplicates. - */ + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + if (current->ptrace) do_notify_parent_cldstop(current, true, why); - if (gstop_done && ptrace_reparented(current)) - do_notify_parent_cldstop(current, false, why); - - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); - read_unlock(&tasklist_lock); - cgroup_enter_frozen(); - preempt_enable_no_resched(); - freezable_schedule(); - cgroup_leave_frozen(true); - } else { - /* - * By the time we got the lock, our tracer went away. - * Don't drop the lock yet, another tracer may come. - * - * If @gstop_done, the ptracer went away between group stop - * completion and here. During detach, it would have set - * JOBCTL_STOP_PENDING on us and we'll re-enter - * TASK_STOPPED in do_signal_stop() on return, so notifying - * the real parent of the group stop completion is enough. - */ - if (gstop_done) - do_notify_parent_cldstop(current, false, why); + if (gstop_done && (!current->ptrace || ptrace_reparented(current))) + do_notify_parent_cldstop(current, false, why); - /* tasklist protects us from ptrace_freeze_traced() */ - __set_current_state(TASK_RUNNING); - if (clear_code) - current->exit_code = 0; - read_unlock(&tasklist_lock); - } + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); + preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching @@ -2315,10 +2313,13 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(¤t->sighand->siglock); + exit_code = current->exit_code; current->last_siginfo = NULL; + current->ptrace_message = 0; + current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ - current->jobctl &= ~JOBCTL_LISTENING; + current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. @@ -2326,9 +2327,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t * This sets TIF_SIGPENDING, but never clears it. */ recalc_sigpending_tsk(current); + return exit_code; } -static void ptrace_do_notify(int signr, int exit_code, int why) +static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message) { kernel_siginfo_t info; @@ -2339,18 +2341,21 @@ static void ptrace_do_notify(int signr, int exit_code, int why) info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ - ptrace_stop(exit_code, why, 1, &info); + return ptrace_stop(exit_code, why, message, &info); } -void ptrace_notify(int exit_code) +int ptrace_notify(int exit_code, unsigned long message) { + int signr; + BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - if (unlikely(current->task_works)) + if (unlikely(task_work_pending(current))) task_work_run(); spin_lock_irq(¤t->sighand->siglock); - ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); + signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message); spin_unlock_irq(¤t->sighand->siglock); + return signr; } /** @@ -2447,6 +2452,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); @@ -2505,11 +2511,10 @@ static void do_jobctl_trap(void) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), - CLD_STOPPED); + CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL); - current->exit_code = 0; } } @@ -2562,15 +2567,12 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; - ptrace_stop(signr, CLD_TRAPPED, 0, info); + signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; if (signr == 0) return signr; - current->exit_code = 0; - /* * Update the siginfo structure if the signal has * changed. If the debugger wanted something @@ -2592,7 +2594,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr) || fatal_signal_pending(current)) { - send_signal(signr, info, current, type); + send_signal_locked(signr, info, current, type); signr = 0; } @@ -2627,20 +2629,12 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) + clear_notify_signal(); + if (unlikely(task_work_pending(current))) task_work_run(); - /* - * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so - * that the arch handlers don't all have to do it. If we get here - * without TIF_SIGPENDING, just exit after running signal work. - */ - if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - tracehook_notify_signal(); - if (!task_sigpending(current)) - return false; - } + if (!task_sigpending(current)) + return false; if (unlikely(uprobe_deny_signal())) return false; @@ -2899,7 +2893,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping) set_current_blocked(&blocked); if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); - tracehook_signal_handler(stepping); + if (stepping) + ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) @@ -3430,6 +3425,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3507,6 +3503,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4720,6 +4717,7 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); + CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); @@ -4791,7 +4789,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); + ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", diff --git a/kernel/smp.c b/kernel/smp.c index 01a7c1706a58..dd215f439426 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); -static void flush_smp_call_function_queue(bool warn_cpu_offline); +static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { @@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * ensure that the outgoing CPU doesn't go offline with work * still pending. */ - flush_smp_call_function_queue(false); + __flush_smp_call_function_queue(false); irq_work_run(); return 0; } @@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); -#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ +module_param(csd_lock_timeout, ulong, 0444); + static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; @@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); + unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * ts2 = sched_clock(); ts_delta = ts2 - *ts1; - if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; @@ -541,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void) { cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_GOTIPI); - flush_smp_call_function_queue(true); + __flush_smp_call_function_queue(true); } /** - * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. @@ -558,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void) * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ -static void flush_smp_call_function_queue(bool warn_cpu_offline) +static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; @@ -579,7 +582,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && - !warned && !llist_empty(head))) { + !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); @@ -681,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_processor_id(), CFD_SEQ_HDLEND); } -void flush_smp_call_function_from_idle(void) + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * from task context (idle, migration thread) + * + * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it + * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by + * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to + * handle queued SMP function calls before scheduling. + * + * The migration thread has to ensure that an eventually pending wakeup has + * been handled before it migrates a task. + */ +void flush_smp_call_function_queue(void) { + unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) @@ -691,9 +708,11 @@ void flush_smp_call_function_from_idle(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); - flush_smp_call_function_queue(true); + /* Get the already pending soft interrupts for RT enabled kernels */ + was_pending = local_softirq_pending(); + __flush_smp_call_function_queue(true); if (local_softirq_pending()) - do_softirq(); + do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f6bc0bc8a2aa..b9f54544e749 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -392,6 +392,13 @@ int cpu_check_up_prepare(int cpu) */ return -EAGAIN; + case CPU_UP_PREPARE: + /* + * Timeout while waiting for the CPU to show up. Allow to try + * again later. + */ + return 0; + default: /* Should not happen. Famous last words. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 41f470929e99..9f0aef8aa9ff 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -222,7 +222,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) u32 pending; int curcnt; - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); local_irq_save(flags); @@ -294,6 +294,19 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +/* + * flush_smp_call_function_queue() can raise a soft interrupt in a function + * call. On RT kernels this is undesired and the only known functionality + * in the block layer which does this is disabled on RT. If soft interrupts + * get raised which haven't been raised before the flush, warn so it can be + * investigated. + */ +void do_softirq_post_smp_call_flush(unsigned int was_pending) +{ + if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + invoke_softirq(); +} + #else /* CONFIG_PREEMPT_RT */ /* @@ -305,7 +318,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); raw_local_irq_save(flags); /* @@ -352,14 +365,14 @@ static void __local_bh_enable(unsigned int cnt) */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_disable(); @@ -618,7 +631,7 @@ static inline void tick_irq_exit(void) /* Make sure that timer wheel updates are propagated */ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_irq()) + if (!in_hardirq()) tick_nohz_irq_exit(); } #endif diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ddb5a7f48d69..c2c33d2202e9 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,59 +70,81 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void noinstr stackleak_erase(void) +static __always_inline void __stackleak_erase(bool on_task_stack) { - /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ - unsigned long kstack_ptr = current->lowest_stack; - unsigned long boundary = (unsigned long)end_of_stack(current); - unsigned int poison_count = 0; - const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); - - if (skip_erasing()) - return; - - /* Check that 'lowest_stack' value is sane */ - if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) - kstack_ptr = boundary; + const unsigned long task_stack_low = stackleak_task_low_bound(current); + const unsigned long task_stack_high = stackleak_task_high_bound(current); + unsigned long erase_low, erase_high; - /* Search for the poison value in the kernel stack */ - while (kstack_ptr > boundary && poison_count <= depth) { - if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON) - poison_count++; - else - poison_count = 0; - - kstack_ptr -= sizeof(unsigned long); - } - - /* - * One 'long int' at the bottom of the thread stack is reserved and - * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y). - */ - if (kstack_ptr == boundary) - kstack_ptr += sizeof(unsigned long); + erase_low = stackleak_find_top_of_poison(task_stack_low, + current->lowest_stack); #ifdef CONFIG_STACKLEAK_METRICS - current->prev_lowest_stack = kstack_ptr; + current->prev_lowest_stack = erase_low; #endif /* - * Now write the poison value to the kernel stack. Start from - * 'kstack_ptr' and move up till the new 'boundary'. We assume that - * the stack pointer doesn't change when we write poison. + * Write poison to the task's stack between 'erase_low' and + * 'erase_high'. + * + * If we're running on a different stack (e.g. an entry trampoline + * stack) we can erase everything below the pt_regs at the top of the + * task stack. + * + * If we're running on the task stack itself, we must not clobber any + * stack used by this function and its caller. We assume that this + * function has a fixed-size stack frame, and the current stack pointer + * doesn't change while we write poison. */ - if (on_thread_stack()) - boundary = current_stack_pointer; + if (on_task_stack) + erase_high = current_stack_pointer; else - boundary = current_top_of_stack(); + erase_high = task_stack_high; - while (kstack_ptr < boundary) { - *(unsigned long *)kstack_ptr = STACKLEAK_POISON; - kstack_ptr += sizeof(unsigned long); + while (erase_low < erase_high) { + *(unsigned long *)erase_low = STACKLEAK_POISON; + erase_low += sizeof(unsigned long); } /* Reset the 'lowest_stack' value for the next syscall */ - current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; + current->lowest_stack = task_stack_high; +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can be called from the task stack or an entry stack when the task stack is + * no longer in use. + */ +asmlinkage void noinstr stackleak_erase(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(on_thread_stack()); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from the task stack. + */ +asmlinkage void noinstr stackleak_erase_on_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(true); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from a stack other than the task stack. + */ +asmlinkage void noinstr stackleak_erase_off_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(false); } void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) @@ -139,8 +161,7 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) /* 'lowest_stack' should be aligned on the register width boundary */ sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && - sp >= (unsigned long)task_stack_page(current) + - sizeof(unsigned long)) { + sp >= stackleak_task_low_bound(current)) { current->lowest_stack = sp; } } diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c625257023d..9ed5ce989415 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) .store = store, .size = size, }; - mm_segment_t fs; /* Trace user stack if not a kernel thread */ if (current->flags & PF_KTHREAD) return 0; - fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - force_uaccess_end(fs); return c.len; } diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..e9c3e69f3837 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -1,548 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/init.h> #include <linux/static_call.h> -#include <linux/bug.h> -#include <linux/smp.h> -#include <linux/sort.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/processor.h> -#include <asm/sections.h> - -extern struct static_call_site __start_static_call_sites[], - __stop_static_call_sites[]; -extern struct static_call_tramp_key __start_static_call_tramp_key[], - __stop_static_call_tramp_key[]; - -static bool static_call_initialized; - -/* mutex to protect key modules/sites */ -static DEFINE_MUTEX(static_call_mutex); - -static void static_call_lock(void) -{ - mutex_lock(&static_call_mutex); -} - -static void static_call_unlock(void) -{ - mutex_unlock(&static_call_mutex); -} - -static inline void *static_call_addr(struct static_call_site *site) -{ - return (void *)((long)site->addr + (long)&site->addr); -} - -static inline unsigned long __static_call_key(const struct static_call_site *site) -{ - return (long)site->key + (long)&site->key; -} - -static inline struct static_call_key *static_call_key(const struct static_call_site *site) -{ - return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); -} - -/* These assume the key is word-aligned. */ -static inline bool static_call_is_init(struct static_call_site *site) -{ - return __static_call_key(site) & STATIC_CALL_SITE_INIT; -} - -static inline bool static_call_is_tail(struct static_call_site *site) -{ - return __static_call_key(site) & STATIC_CALL_SITE_TAIL; -} - -static inline void static_call_set_init(struct static_call_site *site) -{ - site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - - (long)&site->key; -} - -static int static_call_site_cmp(const void *_a, const void *_b) -{ - const struct static_call_site *a = _a; - const struct static_call_site *b = _b; - const struct static_call_key *key_a = static_call_key(a); - const struct static_call_key *key_b = static_call_key(b); - - if (key_a < key_b) - return -1; - - if (key_a > key_b) - return 1; - - return 0; -} - -static void static_call_site_swap(void *_a, void *_b, int size) -{ - long delta = (unsigned long)_a - (unsigned long)_b; - struct static_call_site *a = _a; - struct static_call_site *b = _b; - struct static_call_site tmp = *a; - - a->addr = b->addr - delta; - a->key = b->key - delta; - - b->addr = tmp.addr + delta; - b->key = tmp.key + delta; -} - -static inline void static_call_sort_entries(struct static_call_site *start, - struct static_call_site *stop) -{ - sort(start, stop - start, sizeof(struct static_call_site), - static_call_site_cmp, static_call_site_swap); -} - -static inline bool static_call_key_has_mods(struct static_call_key *key) -{ - return !(key->type & 1); -} - -static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) -{ - if (!static_call_key_has_mods(key)) - return NULL; - - return key->mods; -} - -static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) -{ - if (static_call_key_has_mods(key)) - return NULL; - - return (struct static_call_site *)(key->type & ~1); -} - -void __static_call_update(struct static_call_key *key, void *tramp, void *func) -{ - struct static_call_site *site, *stop; - struct static_call_mod *site_mod, first; - - cpus_read_lock(); - static_call_lock(); - - if (key->func == func) - goto done; - - key->func = func; - - arch_static_call_transform(NULL, tramp, func, false); - - /* - * If uninitialized, we'll not update the callsites, but they still - * point to the trampoline and we just patched that. - */ - if (WARN_ON_ONCE(!static_call_initialized)) - goto done; - - first = (struct static_call_mod){ - .next = static_call_key_next(key), - .mod = NULL, - .sites = static_call_key_sites(key), - }; - - for (site_mod = &first; site_mod; site_mod = site_mod->next) { - bool init = system_state < SYSTEM_RUNNING; - struct module *mod = site_mod->mod; - - if (!site_mod->sites) { - /* - * This can happen if the static call key is defined in - * a module which doesn't use it. - * - * It also happens in the has_mods case, where the - * 'first' entry has no sites associated with it. - */ - continue; - } - - stop = __stop_static_call_sites; - - if (mod) { -#ifdef CONFIG_MODULES - stop = mod->static_call_sites + - mod->num_static_call_sites; - init = mod->state == MODULE_STATE_COMING; -#endif - } - - for (site = site_mod->sites; - site < stop && static_call_key(site) == key; site++) { - void *site_addr = static_call_addr(site); - - if (!init && static_call_is_init(site)) - continue; - - if (!kernel_text_address((unsigned long)site_addr)) { - /* - * This skips patching built-in __exit, which - * is part of init_section_contains() but is - * not part of kernel_text_address(). - * - * Skipping built-in __exit is fine since it - * will never be executed. - */ - WARN_ONCE(!static_call_is_init(site), - "can't patch static call site at %pS", - site_addr); - continue; - } - - arch_static_call_transform(site_addr, NULL, func, - static_call_is_tail(site)); - } - } - -done: - static_call_unlock(); - cpus_read_unlock(); -} -EXPORT_SYMBOL_GPL(__static_call_update); - -static int __static_call_init(struct module *mod, - struct static_call_site *start, - struct static_call_site *stop) -{ - struct static_call_site *site; - struct static_call_key *key, *prev_key = NULL; - struct static_call_mod *site_mod; - - if (start == stop) - return 0; - - static_call_sort_entries(start, stop); - - for (site = start; site < stop; site++) { - void *site_addr = static_call_addr(site); - - if ((mod && within_module_init((unsigned long)site_addr, mod)) || - (!mod && init_section_contains(site_addr, 1))) - static_call_set_init(site); - - key = static_call_key(site); - if (key != prev_key) { - prev_key = key; - - /* - * For vmlinux (!mod) avoid the allocation by storing - * the sites pointer in the key itself. Also see - * __static_call_update()'s @first. - * - * This allows architectures (eg. x86) to call - * static_call_init() before memory allocation works. - */ - if (!mod) { - key->sites = site; - key->type |= 1; - goto do_transform; - } - - site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); - if (!site_mod) - return -ENOMEM; - - /* - * When the key has a direct sites pointer, extract - * that into an explicit struct static_call_mod, so we - * can have a list of modules. - */ - if (static_call_key_sites(key)) { - site_mod->mod = NULL; - site_mod->next = NULL; - site_mod->sites = static_call_key_sites(key); - - key->mods = site_mod; - - site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); - if (!site_mod) - return -ENOMEM; - } - - site_mod->mod = mod; - site_mod->sites = site; - site_mod->next = static_call_key_next(key); - key->mods = site_mod; - } - -do_transform: - arch_static_call_transform(site_addr, NULL, key->func, - static_call_is_tail(site)); - } - - return 0; -} - -static int addr_conflict(struct static_call_site *site, void *start, void *end) -{ - unsigned long addr = (unsigned long)static_call_addr(site); - - if (addr <= (unsigned long)end && - addr + CALL_INSN_SIZE > (unsigned long)start) - return 1; - - return 0; -} - -static int __static_call_text_reserved(struct static_call_site *iter_start, - struct static_call_site *iter_stop, - void *start, void *end, bool init) -{ - struct static_call_site *iter = iter_start; - - while (iter < iter_stop) { - if (init || !static_call_is_init(iter)) { - if (addr_conflict(iter, start, end)) - return 1; - } - iter++; - } - - return 0; -} - -#ifdef CONFIG_MODULES - -static int __static_call_mod_text_reserved(void *start, void *end) -{ - struct module *mod; - int ret; - - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - - if (!mod) - return 0; - - ret = __static_call_text_reserved(mod->static_call_sites, - mod->static_call_sites + mod->num_static_call_sites, - start, end, mod->state == MODULE_STATE_COMING); - - module_put(mod); - - return ret; -} - -static unsigned long tramp_key_lookup(unsigned long addr) -{ - struct static_call_tramp_key *start = __start_static_call_tramp_key; - struct static_call_tramp_key *stop = __stop_static_call_tramp_key; - struct static_call_tramp_key *tramp_key; - - for (tramp_key = start; tramp_key != stop; tramp_key++) { - unsigned long tramp; - - tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; - if (tramp == addr) - return (long)tramp_key->key + (long)&tramp_key->key; - } - - return 0; -} - -static int static_call_add_module(struct module *mod) -{ - struct static_call_site *start = mod->static_call_sites; - struct static_call_site *stop = start + mod->num_static_call_sites; - struct static_call_site *site; - - for (site = start; site != stop; site++) { - unsigned long s_key = __static_call_key(site); - unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; - unsigned long key; - - /* - * Is the key is exported, 'addr' points to the key, which - * means modules are allowed to call static_call_update() on - * it. - * - * Otherwise, the key isn't exported, and 'addr' points to the - * trampoline so we need to lookup the key. - * - * We go through this dance to prevent crazy modules from - * abusing sensitive static calls. - */ - if (!kernel_text_address(addr)) - continue; - - key = tramp_key_lookup(addr); - if (!key) { - pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", - static_call_addr(site)); - return -EINVAL; - } - - key |= s_key & STATIC_CALL_SITE_FLAGS; - site->key = key - (long)&site->key; - } - - return __static_call_init(mod, start, stop); -} - -static void static_call_del_module(struct module *mod) -{ - struct static_call_site *start = mod->static_call_sites; - struct static_call_site *stop = mod->static_call_sites + - mod->num_static_call_sites; - struct static_call_key *key, *prev_key = NULL; - struct static_call_mod *site_mod, **prev; - struct static_call_site *site; - - for (site = start; site < stop; site++) { - key = static_call_key(site); - if (key == prev_key) - continue; - - prev_key = key; - - for (prev = &key->mods, site_mod = key->mods; - site_mod && site_mod->mod != mod; - prev = &site_mod->next, site_mod = site_mod->next) - ; - - if (!site_mod) - continue; - - *prev = site_mod->next; - kfree(site_mod); - } -} - -static int static_call_module_notify(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct module *mod = data; - int ret = 0; - - cpus_read_lock(); - static_call_lock(); - - switch (val) { - case MODULE_STATE_COMING: - ret = static_call_add_module(mod); - if (ret) { - WARN(1, "Failed to allocate memory for static calls"); - static_call_del_module(mod); - } - break; - case MODULE_STATE_GOING: - static_call_del_module(mod); - break; - } - - static_call_unlock(); - cpus_read_unlock(); - - return notifier_from_errno(ret); -} - -static struct notifier_block static_call_module_nb = { - .notifier_call = static_call_module_notify, -}; - -#else - -static inline int __static_call_mod_text_reserved(void *start, void *end) -{ - return 0; -} - -#endif /* CONFIG_MODULES */ - -int static_call_text_reserved(void *start, void *end) -{ - bool init = system_state < SYSTEM_RUNNING; - int ret = __static_call_text_reserved(__start_static_call_sites, - __stop_static_call_sites, start, end, init); - - if (ret) - return ret; - - return __static_call_mod_text_reserved(start, end); -} - -int __init static_call_init(void) -{ - int ret; - - if (static_call_initialized) - return 0; - - cpus_read_lock(); - static_call_lock(); - ret = __static_call_init(NULL, __start_static_call_sites, - __stop_static_call_sites); - static_call_unlock(); - cpus_read_unlock(); - - if (ret) { - pr_err("Failed to allocate memory for static_call!\n"); - BUG(); - } - - static_call_initialized = true; - -#ifdef CONFIG_MODULES - register_module_notifier(&static_call_module_nb); -#endif - return 0; -} -early_initcall(static_call_init); long __static_call_return0(void) { return 0; } - -#ifdef CONFIG_STATIC_CALL_SELFTEST - -static int func_a(int x) -{ - return x+1; -} - -static int func_b(int x) -{ - return x+2; -} - -DEFINE_STATIC_CALL(sc_selftest, func_a); - -static struct static_call_data { - int (*func)(int); - int val; - int expect; -} static_call_data [] __initdata = { - { NULL, 2, 3 }, - { func_b, 2, 4 }, - { func_a, 2, 3 } -}; - -static int __init test_static_call_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { - struct static_call_data *scd = &static_call_data[i]; - - if (scd->func) - static_call_update(sc_selftest, scd->func); - - WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); - } - - return 0; -} -early_initcall(test_static_call_init); - -#endif /* CONFIG_STATIC_CALL_SELFTEST */ +EXPORT_SYMBOL_GPL(__static_call_return0); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c new file mode 100644 index 000000000000..dc5665b62814 --- /dev/null +++ b/kernel/static_call_inline.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/static_call.h> +#include <linux/bug.h> +#include <linux/smp.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/processor.h> +#include <asm/sections.h> + +extern struct static_call_site __start_static_call_sites[], + __stop_static_call_sites[]; +extern struct static_call_tramp_key __start_static_call_tramp_key[], + __stop_static_call_tramp_key[]; + +static bool static_call_initialized; + +/* mutex to protect key modules/sites */ +static DEFINE_MUTEX(static_call_mutex); + +static void static_call_lock(void) +{ + mutex_lock(&static_call_mutex); +} + +static void static_call_unlock(void) +{ + mutex_unlock(&static_call_mutex); +} + +static inline void *static_call_addr(struct static_call_site *site) +{ + return (void *)((long)site->addr + (long)&site->addr); +} + +static inline unsigned long __static_call_key(const struct static_call_site *site) +{ + return (long)site->key + (long)&site->key; +} + +static inline struct static_call_key *static_call_key(const struct static_call_site *site) +{ + return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); +} + +/* These assume the key is word-aligned. */ +static inline bool static_call_is_init(struct static_call_site *site) +{ + return __static_call_key(site) & STATIC_CALL_SITE_INIT; +} + +static inline bool static_call_is_tail(struct static_call_site *site) +{ + return __static_call_key(site) & STATIC_CALL_SITE_TAIL; +} + +static inline void static_call_set_init(struct static_call_site *site) +{ + site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - + (long)&site->key; +} + +static int static_call_site_cmp(const void *_a, const void *_b) +{ + const struct static_call_site *a = _a; + const struct static_call_site *b = _b; + const struct static_call_key *key_a = static_call_key(a); + const struct static_call_key *key_b = static_call_key(b); + + if (key_a < key_b) + return -1; + + if (key_a > key_b) + return 1; + + return 0; +} + +static void static_call_site_swap(void *_a, void *_b, int size) +{ + long delta = (unsigned long)_a - (unsigned long)_b; + struct static_call_site *a = _a; + struct static_call_site *b = _b; + struct static_call_site tmp = *a; + + a->addr = b->addr - delta; + a->key = b->key - delta; + + b->addr = tmp.addr + delta; + b->key = tmp.key + delta; +} + +static inline void static_call_sort_entries(struct static_call_site *start, + struct static_call_site *stop) +{ + sort(start, stop - start, sizeof(struct static_call_site), + static_call_site_cmp, static_call_site_swap); +} + +static inline bool static_call_key_has_mods(struct static_call_key *key) +{ + return !(key->type & 1); +} + +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) +{ + if (!static_call_key_has_mods(key)) + return NULL; + + return key->mods; +} + +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) +{ + if (static_call_key_has_mods(key)) + return NULL; + + return (struct static_call_site *)(key->type & ~1); +} + +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + struct static_call_site *site, *stop; + struct static_call_mod *site_mod, first; + + cpus_read_lock(); + static_call_lock(); + + if (key->func == func) + goto done; + + key->func = func; + + arch_static_call_transform(NULL, tramp, func, false); + + /* + * If uninitialized, we'll not update the callsites, but they still + * point to the trampoline and we just patched that. + */ + if (WARN_ON_ONCE(!static_call_initialized)) + goto done; + + first = (struct static_call_mod){ + .next = static_call_key_next(key), + .mod = NULL, + .sites = static_call_key_sites(key), + }; + + for (site_mod = &first; site_mod; site_mod = site_mod->next) { + bool init = system_state < SYSTEM_RUNNING; + struct module *mod = site_mod->mod; + + if (!site_mod->sites) { + /* + * This can happen if the static call key is defined in + * a module which doesn't use it. + * + * It also happens in the has_mods case, where the + * 'first' entry has no sites associated with it. + */ + continue; + } + + stop = __stop_static_call_sites; + + if (mod) { +#ifdef CONFIG_MODULES + stop = mod->static_call_sites + + mod->num_static_call_sites; + init = mod->state == MODULE_STATE_COMING; +#endif + } + + for (site = site_mod->sites; + site < stop && static_call_key(site) == key; site++) { + void *site_addr = static_call_addr(site); + + if (!init && static_call_is_init(site)) + continue; + + if (!kernel_text_address((unsigned long)site_addr)) { + /* + * This skips patching built-in __exit, which + * is part of init_section_contains() but is + * not part of kernel_text_address(). + * + * Skipping built-in __exit is fine since it + * will never be executed. + */ + WARN_ONCE(!static_call_is_init(site), + "can't patch static call site at %pS", + site_addr); + continue; + } + + arch_static_call_transform(site_addr, NULL, func, + static_call_is_tail(site)); + } + } + +done: + static_call_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(__static_call_update); + +static int __static_call_init(struct module *mod, + struct static_call_site *start, + struct static_call_site *stop) +{ + struct static_call_site *site; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod; + + if (start == stop) + return 0; + + static_call_sort_entries(start, stop); + + for (site = start; site < stop; site++) { + void *site_addr = static_call_addr(site); + + if ((mod && within_module_init((unsigned long)site_addr, mod)) || + (!mod && init_section_contains(site_addr, 1))) + static_call_set_init(site); + + key = static_call_key(site); + if (key != prev_key) { + prev_key = key; + + /* + * For vmlinux (!mod) avoid the allocation by storing + * the sites pointer in the key itself. Also see + * __static_call_update()'s @first. + * + * This allows architectures (eg. x86) to call + * static_call_init() before memory allocation works. + */ + if (!mod) { + key->sites = site; + key->type |= 1; + goto do_transform; + } + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + + /* + * When the key has a direct sites pointer, extract + * that into an explicit struct static_call_mod, so we + * can have a list of modules. + */ + if (static_call_key_sites(key)) { + site_mod->mod = NULL; + site_mod->next = NULL; + site_mod->sites = static_call_key_sites(key); + + key->mods = site_mod; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + } + + site_mod->mod = mod; + site_mod->sites = site; + site_mod->next = static_call_key_next(key); + key->mods = site_mod; + } + +do_transform: + arch_static_call_transform(site_addr, NULL, key->func, + static_call_is_tail(site)); + } + + return 0; +} + +static int addr_conflict(struct static_call_site *site, void *start, void *end) +{ + unsigned long addr = (unsigned long)static_call_addr(site); + + if (addr <= (unsigned long)end && + addr + CALL_INSN_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __static_call_text_reserved(struct static_call_site *iter_start, + struct static_call_site *iter_stop, + void *start, void *end, bool init) +{ + struct static_call_site *iter = iter_start; + + while (iter < iter_stop) { + if (init || !static_call_is_init(iter)) { + if (addr_conflict(iter, start, end)) + return 1; + } + iter++; + } + + return 0; +} + +#ifdef CONFIG_MODULES + +static int __static_call_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + int ret; + + preempt_disable(); + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + preempt_enable(); + + if (!mod) + return 0; + + ret = __static_call_text_reserved(mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites, + start, end, mod->state == MODULE_STATE_COMING); + + module_put(mod); + + return ret; +} + +static unsigned long tramp_key_lookup(unsigned long addr) +{ + struct static_call_tramp_key *start = __start_static_call_tramp_key; + struct static_call_tramp_key *stop = __stop_static_call_tramp_key; + struct static_call_tramp_key *tramp_key; + + for (tramp_key = start; tramp_key != stop; tramp_key++) { + unsigned long tramp; + + tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; + if (tramp == addr) + return (long)tramp_key->key + (long)&tramp_key->key; + } + + return 0; +} + +static int static_call_add_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = start + mod->num_static_call_sites; + struct static_call_site *site; + + for (site = start; site != stop; site++) { + unsigned long s_key = __static_call_key(site); + unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; + unsigned long key; + + /* + * Is the key is exported, 'addr' points to the key, which + * means modules are allowed to call static_call_update() on + * it. + * + * Otherwise, the key isn't exported, and 'addr' points to the + * trampoline so we need to lookup the key. + * + * We go through this dance to prevent crazy modules from + * abusing sensitive static calls. + */ + if (!kernel_text_address(addr)) + continue; + + key = tramp_key_lookup(addr); + if (!key) { + pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", + static_call_addr(site)); + return -EINVAL; + } + + key |= s_key & STATIC_CALL_SITE_FLAGS; + site->key = key - (long)&site->key; + } + + return __static_call_init(mod, start, stop); +} + +static void static_call_del_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = mod->static_call_sites + + mod->num_static_call_sites; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod, **prev; + struct static_call_site *site; + + for (site = start; site < stop; site++) { + key = static_call_key(site); + if (key == prev_key) + continue; + + prev_key = key; + + for (prev = &key->mods, site_mod = key->mods; + site_mod && site_mod->mod != mod; + prev = &site_mod->next, site_mod = site_mod->next) + ; + + if (!site_mod) + continue; + + *prev = site_mod->next; + kfree(site_mod); + } +} + +static int static_call_module_notify(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + cpus_read_lock(); + static_call_lock(); + + switch (val) { + case MODULE_STATE_COMING: + ret = static_call_add_module(mod); + if (ret) { + WARN(1, "Failed to allocate memory for static calls"); + static_call_del_module(mod); + } + break; + case MODULE_STATE_GOING: + static_call_del_module(mod); + break; + } + + static_call_unlock(); + cpus_read_unlock(); + + return notifier_from_errno(ret); +} + +static struct notifier_block static_call_module_nb = { + .notifier_call = static_call_module_notify, +}; + +#else + +static inline int __static_call_mod_text_reserved(void *start, void *end) +{ + return 0; +} + +#endif /* CONFIG_MODULES */ + +int static_call_text_reserved(void *start, void *end) +{ + bool init = system_state < SYSTEM_RUNNING; + int ret = __static_call_text_reserved(__start_static_call_sites, + __stop_static_call_sites, start, end, init); + + if (ret) + return ret; + + return __static_call_mod_text_reserved(start, end); +} + +int __init static_call_init(void) +{ + int ret; + + if (static_call_initialized) + return 0; + + cpus_read_lock(); + static_call_lock(); + ret = __static_call_init(NULL, __start_static_call_sites, + __stop_static_call_sites); + static_call_unlock(); + cpus_read_unlock(); + + if (ret) { + pr_err("Failed to allocate memory for static_call!\n"); + BUG(); + } + + static_call_initialized = true; + +#ifdef CONFIG_MODULES + register_module_notifier(&static_call_module_nb); +#endif + return 0; +} +early_initcall(static_call_init); + +#ifdef CONFIG_STATIC_CALL_SELFTEST + +static int func_a(int x) +{ + return x+1; +} + +static int func_b(int x) +{ + return x+2; +} + +DEFINE_STATIC_CALL(sc_selftest, func_a); + +static struct static_call_data { + int (*func)(int); + int val; + int expect; +} static_call_data [] __initdata = { + { NULL, 2, 3 }, + { func_b, 2, 4 }, + { func_a, 2, 3 } +}; + +static int __init test_static_call_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { + struct static_call_data *scd = &static_call_data[i]; + + if (scd->func) + static_call_update(sc_selftest, scd->func); + + WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); + } + + return 0; +} +early_initcall(test_static_call_init); + +#endif /* CONFIG_STATIC_CALL_SELFTEST */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cbc30271ea4d..cedb17ba158a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -535,8 +535,6 @@ void stop_machine_park(int cpu) kthread_park(stopper->thread); } -extern void sched_set_stop_task(int cpu, struct task_struct *stop); - static void cpu_stop_create(unsigned int cpu) { sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); @@ -633,6 +631,27 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) } EXPORT_SYMBOL_GPL(stop_machine); +#ifdef CONFIG_SCHED_SMT +int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + struct multi_stop_data msdata = { + .fn = fn, + .data = data, + .num_threads = cpumask_weight(smt_mask), + .active_cpus = smt_mask, + }; + + lockdep_assert_cpus_held(); + + /* Set the initial state and stop all online cpus. */ + set_state(&msdata, MULTI_STOP_PREPARE); + return stop_cpus(smt_mask, multi_cpu_stop, &msdata); +} +EXPORT_SYMBOL_GPL(stop_core_cpuslocked); +#endif + /** * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU * @fn: the function to run diff --git a/kernel/sys.c b/kernel/sys.c index 5b0e172c4d47..b911fa6d81ab 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,6 +117,12 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef SME_SET_VL +# define SME_SET_VL(a) (-EINVAL) +#endif +#ifndef SME_GET_VL +# define SME_GET_VL() (-EINVAL) +#endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif @@ -1424,6 +1430,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) return errno; } +/* make sure you are allowed to change @tsk limits before calling this */ +static int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; + } + + /* Holding a refcount on tsk protects tsk->signal from disappearing. */ + rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim) { + /* + * Keep the capable check against init_user_ns until cgroups can + * contain all limits. + */ + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk, resource, new_rlim); + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); + + /* + * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not + * infinite. In case of RLIM_INFINITY the posix CPU timer code + * ignores the rlimit. + */ + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY && + IS_ENABLED(CONFIG_POSIX_TIMERS)) { + /* + * update_rlimit_cpu can fail if the task is exiting, but there + * may be other tasks in the thread group that are not exiting, + * and they need their cpu timers adjusted. + * + * The group_leader is the last task to be released, so if we + * cannot update_rlimit_cpu on it, then the entire process is + * exiting and we do not need to update at all. + */ + update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur); + } + + return retval; +} + SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit value; @@ -1567,63 +1635,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) rlim->rlim_max = (unsigned long)rlim64->rlim_max; } -/* make sure you are allowed to change @tsk limits before calling this */ -int do_prlimit(struct task_struct *tsk, unsigned int resource, - struct rlimit *new_rlim, struct rlimit *old_rlim) -{ - struct rlimit *rlim; - int retval = 0; - - if (resource >= RLIM_NLIMITS) - return -EINVAL; - if (new_rlim) { - if (new_rlim->rlim_cur > new_rlim->rlim_max) - return -EINVAL; - if (resource == RLIMIT_NOFILE && - new_rlim->rlim_max > sysctl_nr_open) - return -EPERM; - } - - /* protect tsk->signal and tsk->sighand from disappearing */ - read_lock(&tasklist_lock); - if (!tsk->sighand) { - retval = -ESRCH; - goto out; - } - - rlim = tsk->signal->rlim + resource; - task_lock(tsk->group_leader); - if (new_rlim) { - /* Keep the capable check against init_user_ns until - cgroups can contain all limits */ - if (new_rlim->rlim_max > rlim->rlim_max && - !capable(CAP_SYS_RESOURCE)) - retval = -EPERM; - if (!retval) - retval = security_task_setrlimit(tsk, resource, new_rlim); - } - if (!retval) { - if (old_rlim) - *old_rlim = *rlim; - if (new_rlim) - *rlim = *new_rlim; - } - task_unlock(tsk->group_leader); - - /* - * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not - * infinite. In case of RLIM_INFINITY the posix CPU timer code - * ignores the rlimit. - */ - if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY && - IS_ENABLED(CONFIG_POSIX_TIMERS)) - update_rlimit_cpu(tsk, new_rlim->rlim_cur); -out: - read_unlock(&tasklist_lock); - return retval; -} - /* rcu lock must be held */ static int check_prlimit_permission(struct task_struct *task, unsigned int flags) @@ -2536,6 +2547,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_SME_SET_VL: + error = SME_SET_VL(arg2); + break; + case PR_SME_GET_VL: + error = SME_GET_VL(); + break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 730ab56d9e92..e52b6e372c60 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,13 +61,9 @@ #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> -#include <linux/kexec.h> -#include <linux/bpf.h> #include <linux/mount.h> #include <linux/userfaultfd_k.h> -#include <linux/latencytop.h> #include <linux/pid.h> -#include <linux/delayacct.h> #include "../lib/kstrtox.h" @@ -82,15 +78,9 @@ #ifdef CONFIG_SPARC #include <asm/setup.h> #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -#include <linux/acct.h> -#endif #ifdef CONFIG_RT_MUTEXES #include <linux/rtmutex.h> #endif -#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) -#include <linux/lockdep.h> -#endif #if defined(CONFIG_SYSCTL) @@ -100,8 +90,6 @@ static const int six_hundred_forty_kb = 640 * 1024; #endif -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -148,66 +136,6 @@ static const int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int bpf_stats_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct static_key *key = (struct static_key *)table->data; - static int saved_val; - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&bpf_stats_enabled_mutex); - val = saved_val; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret && val != saved_val) { - if (val) - static_key_slow_inc(key); - else - static_key_slow_dec(key); - saved_val = val; - } - mutex_unlock(&bpf_stats_enabled_mutex); - return ret; -} - -void __weak unpriv_ebpf_notify(int new_state) -{ -} - -static int bpf_unpriv_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret, unpriv_enable = *(int *)table->data; - bool locked_state = unpriv_enable == 1; - struct ctl_table tmp = *table; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - tmp.data = &unpriv_enable; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (locked_state && unpriv_enable != 1) - return -EPERM; - *(int *)table->data = unpriv_enable; - } - - unpriv_ebpf_notify(unpriv_enable); - - return ret; -} -#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ - /* * /proc/sys support */ @@ -1659,35 +1587,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_TASK_DELAY_ACCT - { - .procname = "task_delayacct", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_delayacct, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", @@ -1696,118 +1595,10 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_deadline_period_max_us", - .data = &sysctl_sched_dl_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_deadline_period_min_us", - .data = &sysctl_sched_dl_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_min_rt_default", - .data = &sysctl_sched_uclamp_util_min_rt_default, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), @@ -1831,24 +1622,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "print-fatal-signals", .data = &print_fatal_signals, @@ -1906,22 +1679,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif #ifdef CONFIG_STACK_TRACER { .procname = "stack_tracer_enabled", @@ -1954,18 +1711,6 @@ static struct ctl_table kern_table[] = { .proc_handler = tracepoint_printk_sysctl, }, #endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", @@ -1994,15 +1739,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MAGIC_SYSRQ { .procname = "sysrq", @@ -2060,17 +1796,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SMP - { - .procname = "oops_all_cpu_backtrace", - .data = &sysctl_oops_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, @@ -2219,13 +1944,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, #ifdef CONFIG_KEYS { .procname = "keys", @@ -2299,35 +2017,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - .proc_handler = bpf_unpriv_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = bpf_stats_handler, - }, -#endif #if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", @@ -2364,29 +2053,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_TWO, }, { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), @@ -2409,55 +2075,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, .maxlen = sizeof(dirtytime_expire_interval), @@ -2629,13 +2246,6 @@ static struct ctl_table vm_table[] = { }, #endif { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), @@ -2732,17 +2342,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/kernel/task_work.c b/kernel/task_work.c index 1698fbe6f0e1..dff75bcde151 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/spinlock.h> #include <linux/task_work.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ @@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the - * it will interrupt the targeted task and run the task_work. @TWA_RESUME - * work is run only when the task exits the kernel and returns to user mode, - * or before entering guest mode. Fails if the @task is exiting/exited and thus - * it can't process this @work. Otherwise @work->func() will be called when the - * @task goes through one of the aforementioned transitions, or exits. + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * + * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted + * task and run the task_work, regardless of whether the task is currently + * running in the kernel or userspace. + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a + * reschedule IPI to force the targeted task to reschedule and run task_work. + * This can be advantageous if there's no strict requirement that the + * task_work be run as soon as possible, just whenever the task enters the + * kernel anyway. + * @TWA_RESUME work is run only when the task exits the kernel and returns to + * user mode, or before entering guest mode. + * + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task goes through one of + * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; + case TWA_SIGNAL_NO_IPI: + __set_notify_signal(task); + break; default: WARN_ON_ONCE(1); break; @@ -78,7 +91,7 @@ task_work_cancel_match(struct task_struct *task, struct callback_head *work; unsigned long flags; - if (likely(!task->task_works)) + if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2b4898b4752e..f7e246336218 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/taskstats_kern.h> #include <linux/tsacct_kern.h> +#include <linux/acct.h> #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> @@ -113,13 +114,14 @@ static void send_cpu_listeners(struct sk_buff *skb, struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, delcount = 0; + int delcount = 0; genlmsg_end(skb, reply); - rc = 0; down_read(&listeners->sem); list_for_each_entry(s, &listeners->list, list) { + int rc; + skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); @@ -152,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } +static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + /* No idea if I'm allowed to access that here, now. */ + struct file *exe_file = get_task_exe_file(tsk); + + if (exe_file) { + /* Following cp_new_stat64() in stat.c . */ + stats->ac_exe_dev = + huge_encode_dev(exe_file->f_inode->i_sb->s_dev); + stats->ac_exe_inode = exe_file->f_inode->i_ino; + fput(exe_file); + } else { + stats->ac_exe_dev = 0; + stats->ac_exe_inode = 0; + } +} + static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) @@ -174,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); + + /* add executable info */ + exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) @@ -619,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); + if (group_dead) + stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -664,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .netnsok = true, }; /* Needed early in initialization */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 04bfd62f5e5c..27b7868b5c30 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -181,5 +181,14 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. +config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US + int "Clocksource watchdog maximum allowable skew (in μs)" + depends on CLOCKSOURCE_WATCHDOG + range 50 1000 + default 100 + help + Specify the maximum amount of allowable watchdog skew in + microseconds before reporting the clocksource to be unstable. + endmenu endif diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 003ccf338d20..5d85014d59b5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -690,7 +690,7 @@ static ssize_t unbind_device_store(struct device *dev, { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); - struct clock_event_device *ce; + struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; @@ -698,9 +698,10 @@ static ssize_t unbind_device_store(struct device *dev, ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); - list_for_each_entry(ce, &clockevent_devices, list) { - if (!strcmp(ce->name, name)) { - ret = __clockevents_try_unbind(ce, dev->id); + list_for_each_entry(iter, &clockevent_devices, list) { + if (!strcmp(iter->name, name)) { + ret = __clockevents_try_unbind(iter, dev->id); + ce = iter; break; } } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1cf73807b450..cee5da1e54c4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -107,7 +107,13 @@ static u64 suspend_start; * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. */ -#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#else +#define MAX_SKEW_USEC 100 +#endif + +#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); @@ -337,7 +343,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); - if (cpumask_weight(&cpus_chosen) == 0) { + if (cpumask_empty(&cpus_chosen)) { preempt_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 96b4e7810426..cb925e8ef9a8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -15,6 +15,7 @@ #include <linux/workqueue.h> #include <linux/compat.h> #include <linux/sched/deadline.h> +#include <linux/task_work.h> #include "posix-timers.h" @@ -34,14 +35,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if * necessary. Needs siglock protection since other code may update the * expiration cache as well. + * + * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and + * we cannot lock_task_sighand. Cannot fail if task is current. */ -void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { u64 nsecs = rlim_new * NSEC_PER_SEC; + unsigned long irq_fl; - spin_lock_irq(&task->sighand->siglock); + if (!lock_task_sighand(task, &irq_fl)) + return -ESRCH; set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); - spin_unlock_irq(&task->sighand->siglock); + unlock_task_sighand(task, &irq_fl); + return 0; } /* @@ -863,7 +870,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { tsk->dl.dl_overrun = 0; - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } } @@ -877,7 +884,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) rt ? "RT" : "CPU", hard ? "hard" : "soft", current->comm, task_pid_nr(current)); } - __group_send_sig_info(signo, SEND_SIG_PRIV, current); + send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID); return true; } @@ -951,7 +958,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, task_tgid(tsk), cur_time); - __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } if (it->expires && it->expires < *expires) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index b1b9b12899f5..8464c5acc913 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -8,6 +8,7 @@ #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/kernel.h> +#include <linux/math.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/sched/clock.h> @@ -199,15 +200,13 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) r = rate; if (r >= 4000000) { - r /= 1000000; + r = DIV_ROUND_CLOSEST(r, 1000000); r_unit = 'M'; + } else if (r >= 4000) { + r = DIV_ROUND_CLOSEST(r, 1000); + r_unit = 'k'; } else { - if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else { - r_unit = ' '; - } + r_unit = ' '; } /* Calculate the ns resolution of this counter */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 17a283ce2b20..30049580cd62 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -169,6 +169,8 @@ static ktime_t tick_init_jiffy_update(void) return period; } +#define MAX_STALLED_JIFFIES 5 + static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -186,7 +188,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL - WARN_ON(tick_nohz_full_running); + WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } @@ -196,6 +198,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + /* + * If jiffies update stalled for too long (timekeeper in stop_machine() + * or VMEXIT'ed for several msecs), force an update. + */ + if (ts->last_tick_jiffies != jiffies) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } else { + if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { + tick_do_update_jiffies64(now); + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } + } + if (ts->inidle) ts->got_idle_tick = 1; } @@ -509,7 +526,6 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } -EXPORT_SYMBOL_GPL(tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { @@ -768,7 +784,7 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + u64 basemono, next_tick, delta, expires; unsigned long basejiff; unsigned int seq; @@ -791,7 +807,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ - if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { @@ -802,10 +818,8 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * disabled this also looks at the next expiring * hrtimer. */ - next_tmr = get_next_timer_interrupt(basejiff, basemono); - ts->next_timer = next_tmr; - /* Take the next rcu event into account */ - next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; + next_tick = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tick; } /* @@ -913,6 +927,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; } @@ -984,6 +1000,45 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) __tick_nohz_full_update_tick(ts, ktime_get()); } +/* + * A pending softirq outside an IRQ (or softirq disabled section) context + * should be waiting for ksoftirqd to handle it. Therefore we shouldn't + * reach here due to the need_resched() early check in can_stop_idle_tick(). + * + * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the + * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, + * triggering the below since wakep_softirqd() is ignored. + * + */ +static bool report_idle_softirq(void) +{ + static int ratelimit; + unsigned int pending = local_softirq_pending(); + + if (likely(!pending)) + return false; + + /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ + if (!cpu_active(smp_processor_id())) { + pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; + if (!pending) + return false; + } + + if (ratelimit < 10) + return false; + + /* On RT, softirqs handling may be waiting on some lock */ + if (!local_bh_blocked()) + return false; + + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + + return true; +} + static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* @@ -1010,17 +1065,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (need_resched()) return false; - if (unlikely(local_softirq_pending())) { - static int ratelimit; - - if (ratelimit < 10 && !local_bh_blocked() && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } + if (unlikely(report_idle_softirq())) return false; - } if (tick_nohz_full_enabled()) { /* @@ -1319,9 +1365,15 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); - /* No need to reprogram if we are running tickless */ - if (unlikely(ts->tick_stopped)) + if (unlikely(ts->tick_stopped)) { + /* + * The clockevent device is not reprogrammed, so change the + * clock event device to ONESHOT_STOPPED to avoid spurious + * interrupts on devices which might not be truly one shot. + */ + tick_program_event(KTIME_MAX, 1); return; + } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); @@ -1493,7 +1545,7 @@ void tick_cancel_sched_timer(int cpu) } #endif -/** +/* * Async notification about clocksource changes */ void tick_clock_notify(void) @@ -1514,7 +1566,7 @@ void tick_oneshot_notify(void) set_bit(0, &ts->check_clocks); } -/** +/* * Check, if a change happened, which makes oneshot possible. * * Called cyclic from the hrtimer softirq (driven by the timer diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index d952ae393423..504649513399 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -49,6 +49,8 @@ enum tick_nohz_mode { * @timer_expires_base: Base time clock monotonic for @timer_expires * @next_timer: Expiry time of next expiring timer for debugging purpose only * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick + * @last_tick_jiffies: Value of jiffies seen on last tick + * @stalled_jiffies: Number of stalled jiffies detected across ticks */ struct tick_sched { struct hrtimer sched_timer; @@ -77,6 +79,8 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; atomic_t tick_dep_mask; + unsigned long last_tick_jiffies; + unsigned int stalled_jiffies; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dcdcb85121e4..8e4b3c32fcf9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> +#include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> @@ -429,6 +430,14 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } +static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) +{ + u64 delta, cycles = tk_clock_read(tkr); + + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; @@ -439,12 +448,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); + now += fast_tk_get_delta_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -482,7 +486,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) * of the following timestamps. Callers need to be aware of that and * deal with it. */ -u64 ktime_get_mono_fast_ns(void) +u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } @@ -494,7 +498,7 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ -u64 ktime_get_raw_fast_ns(void) +u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } @@ -528,10 +532,27 @@ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; - return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); +/** + * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. + * + * The same limitations as described for ktime_get_boot_fast_ns() apply. The + * mono time and the TAI offset are not read atomically which may yield wrong + * readouts. However, an update of the TAI offset is an rare event e.g., caused + * by settime or adjtimex with an offset. The user of this function has to deal + * with the possibility of wrong timestamps in post processing. + */ +u64 notrace ktime_get_tai_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); +} +EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); + static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; @@ -543,10 +564,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - - delta = timekeeping_delta_to_ns(tkr, - clocksource_delta(tk_clock_read(tkr), - tkr->cycle_last, tkr->mask)); + delta = fast_tk_get_delta_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -2380,6 +2398,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return 0; } +/** + * random_get_entropy_fallback - Returns the raw clock source value, + * used by random.c for platforms with no valid random_get_entropy(). + */ +unsigned long random_get_entropy_fallback(void) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (unlikely(timekeeping_suspended || !clock)) + return 0; + return clock->read(clock); +} +EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 85f1021ad459..717fcb9fb14a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -44,6 +44,7 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> +#include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -223,7 +224,7 @@ static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP -unsigned int sysctl_timer_migration = 1; +static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); @@ -234,7 +235,42 @@ static void timers_update_migration(void) else static_branch_disable(&timers_migration_enabled); } -#else + +#ifdef CONFIG_SYSCTL +static int timer_migration_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&timer_keys_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); + return ret; +} + +static struct ctl_table timer_sysctl[] = { + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init timer_sysctl_init(void) +{ + register_sysctl("kernel", timer_sysctl); + return 0; +} +device_initcall(timer_sysctl_init); +#endif /* CONFIG_SYSCTL */ +#else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ @@ -251,19 +287,6 @@ void timers_update_nohz(void) schedule_work(&timer_update_work); } -int timer_migration_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - mutex_lock(&timer_keys_mutex); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!ret && write) - timers_update_migration(); - mutex_unlock(&timer_keys_mutex); - return ret; -} - static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); @@ -502,7 +525,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl, * * Round up with level granularity to prevent this. */ - expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } @@ -615,9 +638,39 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer static const struct debug_obj_descr timer_debug_descr; +struct timer_hint { + void (*function)(struct timer_list *t); + long offset; +}; + +#define TIMER_HINT(fn, container, timr, hintfn) \ + { \ + .function = fn, \ + .offset = offsetof(container, hintfn) - \ + offsetof(container, timr) \ + } + +static const struct timer_hint timer_hints[] = { + TIMER_HINT(delayed_work_timer_fn, + struct delayed_work, timer, work.func), + TIMER_HINT(kthread_delayed_work_timer_fn, + struct kthread_delayed_work, timer, work.func), +}; + static void *timer_debug_hint(void *addr) { - return ((struct timer_list *) addr)->function; + struct timer_list *timer = addr; + int i; + + for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { + if (timer_hints[i].function == timer->function) { + void (**fn)(void) = addr + timer_hints[i].offset; + + return *fn; + } + } + + return timer->function; } static bool timer_is_static_object(void *addr) @@ -1722,11 +1775,14 @@ static inline void __run_timers(struct timer_base *base) time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* - * The only possible reason for not finding any expired - * timer at this clk is that all matching timers have been - * dequeued. + * The two possible reasons for not finding any expired + * timer at this clk are that all matching timers have been + * dequeued or no timer has been queued since + * base::next_expiry was set to base::clk + + * NEXT_TIMER_MAX_DELTA. */ - WARN_ON_ONCE(!levels && !base->next_expiry_recalc); + WARN_ON_ONCE(!levels && !base->next_expiry_recalc + && base->timers_pending); base->clk++; base->next_expiry = __next_timer_interrupt(base); @@ -1777,8 +1833,6 @@ void update_process_times(int user_tick) { struct task_struct *p = current; - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); @@ -1950,6 +2004,7 @@ int timers_prepare_cpu(unsigned int cpu) base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; } diff --git a/kernel/torture.c b/kernel/torture.c index ef27a6c82451..789aeb0e1159 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -911,7 +911,7 @@ void torture_kthread_stopping(char *title) { char buf[128]; - snprintf(buf, sizeof(buf), "Stopping %s", title); + snprintf(buf, sizeof(buf), "%s is stopping", title); VERBOSE_TOROUT_STRING(buf); while (!kthread_should_stop()) { torture_shutdown_absorb(title); @@ -931,12 +931,14 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, "%s", s); + *tp = kthread_create(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); TOROUT_ERRSTRING(f); *tp = NULL; + return ret; } + wake_up_process(*tp); // Process is sleeping, so ordering provided. torture_shuffle_task_register(*tp); return ret; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5eb5e7fd624..debbbb083286 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_RETHOOK + bool + +config RETHOOK + bool + depends on HAVE_RETHOOK + help + Enable generic return hooking feature. This is an internal + API, which will be used by other function-entry hooking + features like fprobe and kprobes. + config HAVE_FUNCTION_TRACER bool help @@ -133,6 +144,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select TASKS_RCU if PREEMPTION config GENERIC_TRACER bool @@ -236,6 +248,21 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config FPROBE + bool "Kernel Function Probe (fprobe)" + depends on FUNCTION_TRACER + depends on DYNAMIC_FTRACE_WITH_REGS + depends on HAVE_RETHOOK + select RETHOOK + default n + help + This option enables kernel function probe (fprobe) based on ftrace. + The fprobe is similar to kprobes, but probes only for kernel function + entries and exits. This also can probe multiple functions by one + fprobe. + + If unsure, say N. + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER @@ -702,6 +729,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on FTRACE_MCOUNT_RECORD + select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT def_bool y @@ -737,6 +765,21 @@ config SYNTH_EVENTS If in doubt, say N. +config USER_EVENTS + bool "User trace events" + select TRACING + select DYNAMIC_EVENTS + depends on BROKEN || COMPILE_TEST # API needs to be straighten out + help + User trace events are user-defined trace events that + can be used like an existing kernel trace event. User trace + events are generated by writing to a tracefs file. User + processes can determine if their tracing events should be + generated by memory mapping a tracefs file and checking for + an associated byte being non-zero. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..0d261774d6f3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +# Functions in this file could be invoked from early interrupt +# code and produce random code coverage. +KCOV_INSTRUMENT_trace_preemptirq.o := n + CFLAGS_bpf_trace.o := -I$(src) CFLAGS_trace_benchmark.o := -I$(src) @@ -82,6 +86,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o +obj-$(CONFIG_USER_EVENTS) += trace_events_user.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o @@ -97,6 +102,8 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o +obj-$(CONFIG_FPROBE) += fprobe.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 21dea90eaa93..fe04c6f96ca5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, - const char *fmt, ...) +void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; + u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) @@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_id(blkcg->css.cgroup) : 1); -#else - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); + if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + cgid = cgroup_id(css->cgroup); + else + cgid = 1; #endif + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__trace_note_message); +EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) @@ -411,7 +411,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, NULL, "%s", msg); + __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -770,19 +770,17 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - - mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { + struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ @@ -790,9 +788,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; - if (!bio->bi_blkg) + blkcg_css = bio_blkcg_css(bio); + if (!blkcg_css) return 0; - return cgroup_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) @@ -1902,7 +1901,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op) switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: - case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 21aa30644219..88589d74a892 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,9 @@ #include <linux/error-injection.h> #include <linux/btf_ids.h> #include <linux/bpf_lsm.h> +#include <linux/fprobe.h> +#include <linux/bsearch.h> +#include <linux/sort.h> #include <net/bpf_sk_storage.h> @@ -77,6 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); /** * trace_call_bpf - invoke BPF program @@ -124,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); + rcu_read_lock(); + ret = bpf_prog_run_array(rcu_dereference(call->prog_array), + ctx, bpf_prog_run); + rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -332,8 +340,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(uaccess_kernel())) - return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; @@ -835,8 +841,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) */ if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(uaccess_kernel())) - return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; @@ -1036,6 +1040,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_entry_ip(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { + .func = bpf_get_func_ip_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { + .func = bpf_get_attach_cookie_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; @@ -1063,6 +1091,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { + .func = bpf_get_attach_cookie_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { #ifndef CONFIG_X86 @@ -1154,6 +1197,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: @@ -1235,6 +1280,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_copy_from_user_task: + return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: @@ -1277,9 +1324,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - return &bpf_get_func_ip_proto_kprobe; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_func_ip_proto_kprobe_multi : + &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - return &bpf_get_attach_cookie_proto_trace; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_attach_cookie_proto_kmulti : + &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1562,6 +1613,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { extern const struct bpf_func_proto bpf_skb_output_proto; extern const struct bpf_func_proto bpf_xdp_output_proto; +extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1653,6 +1705,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; + case BPF_FUNC_skc_to_mptcp_sock: + return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: @@ -1661,6 +1715,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_from_file_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_trace_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? @@ -1682,6 +1738,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) @@ -2176,3 +2234,362 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FPROBE +struct bpf_kprobe_multi_link { + struct bpf_link link; + struct fprobe fp; + unsigned long *addrs; + u64 *cookies; + u32 cnt; +}; + +struct bpf_kprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_kprobe_multi_link *link; + unsigned long entry_ip; +}; + +struct user_syms { + const char **syms; + char *buf; +}; + +static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) +{ + unsigned long __user usymbol; + const char **syms = NULL; + char *buf = NULL, *p; + int err = -ENOMEM; + unsigned int i; + + syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); + if (!syms) + goto error; + + buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); + if (!buf) + goto error; + + for (p = buf, i = 0; i < cnt; i++) { + if (__get_user(usymbol, usyms + i)) { + err = -EFAULT; + goto error; + } + err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + syms[i] = p; + p += err + 1; + } + + us->syms = syms; + us->buf = buf; + return 0; + +error: + if (err) { + kvfree(syms); + kvfree(buf); + } + return err; +} + +static void free_user_syms(struct user_syms *us) +{ + kvfree(us->syms); + kvfree(us->buf); +} + +static void bpf_kprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + unregister_fprobe(&kmulti_link->fp); +} + +static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + kvfree(kmulti_link->addrs); + kvfree(kmulti_link->cookies); + kfree(kmulti_link); +} + +static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { + .release = bpf_kprobe_multi_link_release, + .dealloc = bpf_kprobe_multi_link_dealloc, +}; + +static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) +{ + const struct bpf_kprobe_multi_link *link = priv; + unsigned long *addr_a = a, *addr_b = b; + u64 *cookie_a, *cookie_b; + + cookie_a = link->cookies + (addr_a - link->addrs); + cookie_b = link->cookies + (addr_b - link->addrs); + + /* swap addr_a/addr_b and cookie_a/cookie_b values */ + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); +} + +static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) +{ + const unsigned long *addr_a = a, *addr_b = b; + + if (*addr_a == *addr_b) + return 0; + return *addr_a < *addr_b ? -1 : 1; +} + +static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) +{ + return __bpf_kprobe_multi_cookie_cmp(a, b); +} + +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + struct bpf_kprobe_multi_link *link; + u64 *cookie, entry_ip; + unsigned long *addr; + + if (WARN_ON_ONCE(!ctx)) + return 0; + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + link = run_ctx->link; + if (!link->cookies) + return 0; + entry_ip = run_ctx->entry_ip; + addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), + __bpf_kprobe_multi_cookie_cmp); + if (!addr) + return 0; + cookie = link->cookies + (addr - link->addrs); + return *cookie; +} + +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + return run_ctx->entry_ip; +} + +static int +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, + unsigned long entry_ip, struct pt_regs *regs) +{ + struct bpf_kprobe_multi_run_ctx run_ctx = { + .link = link, + .entry_ip = entry_ip, + }; + struct bpf_run_ctx *old_run_ctx; + int err; + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + err = 0; + goto out; + } + + migrate_disable(); + rcu_read_lock(); + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + + out: + __this_cpu_dec(bpf_prog_active); + return err; +} + +static void +kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct bpf_kprobe_multi_link *link; + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, entry_ip, regs); +} + +static int symbols_cmp_r(const void *a, const void *b, const void *priv) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; + +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; + + swap(*name_a, *name_b); + + /* If defined, swap also related cookies. */ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } +} + +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_kprobe_multi_link *link = NULL; + struct bpf_link_primer link_primer; + void __user *ucookies; + unsigned long *addrs; + u32 flags, cnt, size; + void __user *uaddrs; + u64 *cookies = NULL; + void __user *usyms; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.kprobe_multi.flags; + if (flags & ~BPF_F_KPROBE_MULTI_RETURN) + return -EINVAL; + + uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); + usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); + if (!!uaddrs == !!usyms) + return -EINVAL; + + cnt = attr->link_create.kprobe_multi.cnt; + if (!cnt) + return -EINVAL; + + size = cnt * sizeof(*addrs); + addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; + struct user_syms us; + + err = copy_user_syms(&us, usyms, cnt); + if (err) + goto error; + + if (cookies) + data.funcs = us.syms; + + sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + + err = ftrace_lookup_symbols(us.syms, cnt, addrs); + free_user_syms(&us); + if (err) + goto error; + } + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, + &bpf_kprobe_multi_link_lops, prog); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + if (flags & BPF_F_KPROBE_MULTI_RETURN) + link->fp.exit_handler = kprobe_multi_link_handler; + else + link->fp.entry_handler = kprobe_multi_link_handler; + + link->addrs = addrs; + link->cookies = cookies; + link->cnt = cnt; + + if (cookies) { + /* + * Sorting addresses will trigger sorting cookies as well + * (check bpf_kprobe_multi_cookie_swap). This way we can + * find cookie based on the address in bpf_get_attach_cookie + * helper. + */ + sort_r(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_cookie_cmp, + bpf_kprobe_multi_cookie_swap, + link); + } + + err = register_fprobe_ips(&link->fp, addrs, cnt); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); + +error: + kfree(link); + kvfree(addrs); + kvfree(cookies); + return err; +} +#else /* !CONFIG_FPROBE */ +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + return 0; +} +#endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 22061d38fc00..218cd95bf8e4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -7,6 +7,7 @@ * * Highly modified by Steven Rostedt (VMware). */ +#include <linux/jump_label.h> #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/slab.h> @@ -23,23 +24,31 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif -static bool kill_ftrace_graph; +DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph); int ftrace_graph_active; /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; -/** - * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called - * - * ftrace_graph_stop() is called when a severe error is detected in - * the function graph tracing. This function is called by the critical - * paths of function graph to keep those paths from doing any more harm. +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * archs can override this function if they must do something + * to enable hook for graph tracer. + */ +int __weak ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * to disable hook for graph tracer. */ -bool ftrace_graph_is_dead(void) +int __weak ftrace_disable_ftrace_graph_caller(void) { - return kill_ftrace_graph; + return 0; } +#endif /** * ftrace_graph_stop - set to permanently disable function graph tracing @@ -51,7 +60,7 @@ bool ftrace_graph_is_dead(void) */ void ftrace_graph_stop(void) { - kill_ftrace_graph = true; + static_branch_enable(&kill_ftrace_graph); } /* Add a function return address to the trace stack on thread info.*/ @@ -415,7 +424,9 @@ free: static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) { unsigned long long timestamp; int index; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c new file mode 100644 index 000000000000..aac63ca9c3d1 --- /dev/null +++ b/kernel/trace/fprobe.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fprobe - Simple ftrace probe wrapper for function entry. + */ +#define pr_fmt(fmt) "fprobe: " fmt + +#include <linux/err.h> +#include <linux/fprobe.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +#include "trace.h" + +struct fprobe_rethook_node { + struct rethook_node node; + unsigned long entry_ip; +}; + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_rethook_node *fpr; + struct rethook_node *rh; + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + if (fp->entry_handler) + fp->entry_handler(fp, ip, ftrace_get_regs(fregs)); + + if (fp->exit_handler) { + rh = rethook_try_get(fp->rethook); + if (!rh) { + fp->nmissed++; + goto out; + } + fpr = container_of(rh, struct fprobe_rethook_node, node); + fpr->entry_ip = ip; + rethook_hook(rh, ftrace_get_regs(fregs), true); + } + +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_handler); + +static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp = container_of(ops, struct fprobe, ops); + + if (unlikely(kprobe_running())) { + fp->nmissed++; + return; + } + kprobe_busy_begin(); + fprobe_handler(ip, parent_ip, ops, fregs); + kprobe_busy_end(); +} + +static void fprobe_exit_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) +{ + struct fprobe *fp = (struct fprobe *)data; + struct fprobe_rethook_node *fpr; + + if (!fp || fprobe_disabled(fp)) + return; + + fpr = container_of(rh, struct fprobe_rethook_node, node); + + fp->exit_handler(fp, fpr->entry_ip, regs); +} +NOKPROBE_SYMBOL(fprobe_exit_handler); + +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +/* Convert ftrace location address from symbols */ +static unsigned long *get_ftrace_locations(const char **syms, int num) +{ + unsigned long *addrs; + + /* Convert symbols to symbol address */ + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return ERR_PTR(-ENOMEM); + + /* ftrace_lookup_symbols expects sorted symbols */ + sort(syms, num, sizeof(*syms), symbols_cmp, NULL); + + if (!ftrace_lookup_symbols(syms, num, addrs)) + return addrs; + + kfree(addrs); + return ERR_PTR(-ENOENT); +} + +static void fprobe_init(struct fprobe *fp) +{ + fp->nmissed = 0; + if (fprobe_shared_with_kprobes(fp)) + fp->ops.func = fprobe_kprobe_handler; + else + fp->ops.func = fprobe_handler; + fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; +} + +static int fprobe_init_rethook(struct fprobe *fp, int num) +{ + int i, size; + + if (num < 0) + return -EINVAL; + + if (!fp->exit_handler) { + fp->rethook = NULL; + return 0; + } + + /* Initialize rethook if needed */ + size = num * num_possible_cpus() * 2; + if (size < 0) + return -E2BIG; + + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + for (i = 0; i < size; i++) { + struct fprobe_rethook_node *node; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + rethook_free(fp->rethook); + fp->rethook = NULL; + return -ENOMEM; + } + rethook_add_node(fp->rethook, &node->node); + } + return 0; +} + +static void fprobe_fail_cleanup(struct fprobe *fp) +{ + if (fp->rethook) { + /* Don't need to cleanup rethook->handler because this is not used. */ + rethook_free(fp->rethook); + fp->rethook = NULL; + } + ftrace_free_filter(&fp->ops); +} + +/** + * register_fprobe() - Register fprobe to ftrace by pattern. + * @fp: A fprobe data structure to be registered. + * @filter: A wildcard pattern of probed symbols. + * @notfilter: A wildcard pattern of NOT probed symbols. + * + * Register @fp to ftrace for enabling the probe on the symbols matched to @filter. + * If @notfilter is not NULL, the symbols matched the @notfilter are not probed. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + struct ftrace_hash *hash; + unsigned char *str; + int ret, len; + + if (!fp || !filter) + return -EINVAL; + + fprobe_init(fp); + + len = strlen(filter); + str = kstrdup(filter, GFP_KERNEL); + ret = ftrace_set_filter(&fp->ops, str, len, 0); + kfree(str); + if (ret) + return ret; + + if (notfilter) { + len = strlen(notfilter); + str = kstrdup(notfilter, GFP_KERNEL); + ret = ftrace_set_notrace(&fp->ops, str, len, 0); + kfree(str); + if (ret) + goto out; + } + + /* TODO: + * correctly calculate the total number of filtered symbols + * from both filter and notfilter. + */ + hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); + if (WARN_ON_ONCE(!hash)) + goto out; + + ret = fprobe_init_rethook(fp, (int)hash->count); + if (!ret) + ret = register_ftrace_function(&fp->ops); + +out: + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe); + +/** + * register_fprobe_ips() - Register fprobe to ftrace by address. + * @fp: A fprobe data structure to be registered. + * @addrs: An array of target ftrace location addresses. + * @num: The number of entries of @addrs. + * + * Register @fp to ftrace for enabling the probe on the address given by @addrs. + * The @addrs must be the addresses of ftrace location address, which may be + * the symbol address + arch-dependent offset. + * If you unsure what this mean, please use other registration functions. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + int ret; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + fprobe_init(fp); + + ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + if (ret) + return ret; + + ret = fprobe_init_rethook(fp, num); + if (!ret) + ret = register_ftrace_function(&fp->ops); + + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_ips); + +/** + * register_fprobe_syms() - Register fprobe to ftrace by symbols. + * @fp: A fprobe data structure to be registered. + * @syms: An array of target symbols. + * @num: The number of entries of @syms. + * + * Register @fp to the symbols given by @syms array. This will be useful if + * you are sure the symbols exist in the kernel. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + unsigned long *addrs; + int ret; + + if (!fp || !syms || num <= 0) + return -EINVAL; + + addrs = get_ftrace_locations(syms, num); + if (IS_ERR(addrs)) + return PTR_ERR(addrs); + + ret = register_fprobe_ips(fp, addrs, num); + + kfree(addrs); + + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_syms); + +/** + * unregister_fprobe() - Unregister fprobe from ftrace + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret; + + if (!fp || fp->ops.func != fprobe_handler) + return -EINVAL; + + /* + * rethook_free() starts disabling the rethook, but the rethook handlers + * may be running on other processors at this point. To make sure that all + * current running handlers are finished, call unregister_ftrace_function() + * after this. + */ + if (fp->rethook) + rethook_free(fp->rethook); + + ret = unregister_ftrace_function(&fp->ops); + if (ret < 0) + return ret; + + ftrace_free_filter(&fp->ops); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6105b7036482..601ccf1b2f09 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,6 +45,8 @@ #include "trace_output.h" #include "trace_stat.h" +#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" + #define FTRACE_WARN_ON(cond) \ ({ \ int ___r = cond; \ @@ -86,7 +88,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = { /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; -static int last_ftrace_enabled; +static int __maybe_unused last_ftrace_enabled; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; @@ -119,7 +121,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */ void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); @@ -952,7 +954,6 @@ static struct tracer_stat function_stats __initdata = { static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; - struct dentry *entry; char *name; int ret; int cpu; @@ -983,11 +984,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", - TRACE_MODE_WRITE, d_tracer, NULL, - &ftrace_profile_fops); - if (!entry) - pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); + trace_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -1568,17 +1567,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) } /** - * ftrace_location - return true if the ip giving is a traced location + * ftrace_location - return the ftrace location * @ip: the instruction pointer to check * - * Returns rec->ip if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. + * If @ip matches the ftrace location, return @ip. + * If @ip matches sym+0, return sym's ftrace location. + * Otherwise, return 0. */ unsigned long ftrace_location(unsigned long ip) { - return ftrace_location_range(ip, ip); + struct dyn_ftrace *rec; + unsigned long offset; + unsigned long size; + + rec = lookup_rec(ip, ip); + if (!rec) { + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) + goto out; + + /* map sym+0 to __fentry__ */ + if (!offset) + rec = lookup_rec(ip, ip + size - 1); + } + + if (rec) + return rec->ip; + +out: + return 0; } /** @@ -2690,18 +2706,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) * archs can override this function if they must do something * before the modifying code is performed. */ -int __weak ftrace_arch_code_modify_prepare(void) +void __weak ftrace_arch_code_modify_prepare(void) { - return 0; } /* * archs can override this function if they must do something * after the modifying code is performed. */ -int __weak ftrace_arch_code_modify_post_process(void) +void __weak ftrace_arch_code_modify_post_process(void) { - return 0; } void ftrace_modify_all_code(int command) @@ -2787,12 +2801,7 @@ void __weak arch_ftrace_update_code(int command) static void ftrace_run_update_code(int command) { - int ret; - - ret = ftrace_arch_code_modify_prepare(); - FTRACE_WARN_ON(ret); - if (ret) - return; + ftrace_arch_code_modify_prepare(); /* * By default we use stop_machine() to modify the code. @@ -2802,8 +2811,7 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - ret = ftrace_arch_code_modify_post_process(); - FTRACE_WARN_ON(ret); + ftrace_arch_code_modify_post_process(); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -3048,40 +3056,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_startup_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* Force update next time */ - saved_ftrace_func = NULL; - /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) { - command = FTRACE_UPDATE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_START_FUNC_RET; - ftrace_startup_enable(command); - } -} - -static void ftrace_shutdown_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) { - command = FTRACE_DISABLE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_STOP_FUNC_RET; - ftrace_run_update_code(command); - } -} - static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; unsigned long ftrace_number_of_pages; @@ -3648,6 +3622,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, seq_printf(m, " ->%pS", ptr); } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +/* + * Weak functions can still have an mcount/fentry that is saved in + * the __mcount_loc section. These can be detected by having a + * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the + * symbol found by kallsyms is not the function that the mcount/fentry + * is part of. The offset is much greater in these cases. + * + * Test the record to make sure that the ip points to a valid kallsyms + * and if not, mark it disabled. + */ +static int test_for_valid_rec(struct dyn_ftrace *rec) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned long offset; + const char *ret; + + ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str); + + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + rec->flags |= FTRACE_FL_DISABLED; + return 0; + } + return 1; +} + +static struct workqueue_struct *ftrace_check_wq __initdata; +static struct work_struct ftrace_check_work __initdata; + +/* + * Scan all the mcount/fentry entries to make sure they are valid. + */ +static __init void ftrace_check_work_func(struct work_struct *work) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + test_for_valid_rec(rec); + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static int __init ftrace_check_for_weak_functions(void) +{ + INIT_WORK(&ftrace_check_work, ftrace_check_work_func); + + ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0); + + queue_work(ftrace_check_wq, &ftrace_check_work); + return 0; +} + +static int __init ftrace_check_sync(void) +{ + /* Make sure the ftrace_check updates are finished */ + if (ftrace_check_wq) + destroy_workqueue(ftrace_check_wq); + return 0; +} + +late_initcall_sync(ftrace_check_sync); +subsys_initcall(ftrace_check_for_weak_functions); + +static int print_rec(struct seq_file *m, unsigned long ip) +{ + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(ip, NULL, &offset, &modname, str); + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld", + FTRACE_INVALID_FUNCTION, offset); + ret = NULL; + } + + seq_puts(m, str); + if (modname) + seq_printf(m, " [%s]", modname); + return ret == NULL ? -1 : 0; +} +#else +static inline int test_for_valid_rec(struct dyn_ftrace *rec) +{ + return 1; +} + +static inline int print_rec(struct seq_file *m, unsigned long ip) +{ + seq_printf(m, "%ps", (void *)ip); + return 0; +} +#endif + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -3672,7 +3745,13 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps", (void *)rec->ip); + if (print_rec(m, rec->ip)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); + seq_putc(m, '\n'); + return 0; + } + if (iter->flags & FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; @@ -3990,6 +4069,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, return 0; } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + unsigned long offset; + + kallsyms_lookup(ip, NULL, &offset, modname, str); + if (offset > FTRACE_MCOUNT_MAX_OFFSET) + return -1; + return 0; +} +#else +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + kallsyms_lookup(ip, NULL, NULL, modname, str); + return 0; +} +#endif + static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) @@ -3997,7 +4094,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, char str[KSYM_SYMBOL_LEN]; char *modname; - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + if (lookup_ip(rec->ip, &modname, str)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING && + !(rec->flags & FTRACE_FL_DISABLED)); + return 0; + } if (mod_g) { int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; @@ -4448,7 +4550,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to remove the data from * * Returns the data if it is found, otherwise NULL. - * Note, if the data pointer is used as the data itself, (see + * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. */ @@ -4543,8 +4645,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops, void *data) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; @@ -4563,11 +4665,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) { + if (!probe) { probe = kzalloc(sizeof(*probe), GFP_KERNEL); if (!probe) { mutex_unlock(&ftrace_lock); @@ -4685,9 +4789,9 @@ int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; @@ -4715,11 +4819,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) + if (!probe) goto err_unlock_ftrace; ret = -EINVAL; @@ -4958,11 +5064,12 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) { struct ftrace_func_entry *entry; - if (!ftrace_location(ip)) + ip = ftrace_location(ip); + if (!ip) return -EINVAL; if (remove) { @@ -4977,8 +5084,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) } static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, + unsigned int cnt, int remove) +{ + unsigned int i; + int err; + + for (i = 0; i < cnt; i++) { + err = __ftrace_match_addr(hash, ips[i], remove); + if (err) { + /* + * This expects the @hash is a temporary hash and if this + * fails the caller must free the @hash. + */ + return err; + } + } + return 0; +} + +static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, - unsigned long ip, int remove, int reset, int enable) + unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5008,8 +5136,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } - if (ip) { - ret = ftrace_match_addr(hash, ip, remove); + if (ips) { + ret = ftrace_match_addr(hash, ips, cnt, remove); if (ret < 0) goto out_regex_unlock; } @@ -5026,10 +5154,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } static int -ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, - int reset, int enable) +ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -5110,11 +5238,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; - int ret = -EBUSY; + int ret = -ENODEV; mutex_lock(&direct_mutex); + ip = ftrace_location(ip); + if (!ip) + goto out_unlock; + /* See if there's a direct function at @ip already */ + ret = -EBUSY; if (ftrace_find_rec_direct(ip)) goto out_unlock; @@ -5151,8 +5284,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); - if (ret) - remove_hash_entry(direct_functions, entry); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { ret = register_ftrace_function(&direct_ops); @@ -5161,6 +5292,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } if (ret) { + remove_hash_entry(direct_functions, entry); kfree(entry); if (!direct->count) { list_del_rcu(&direct->next); @@ -5222,6 +5354,10 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); + ip = ftrace_location(ip); + if (!ip) + goto out_unlock; + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5354,6 +5490,11 @@ int modify_ftrace_direct(unsigned long ip, mutex_lock(&direct_mutex); mutex_lock(&ftrace_lock); + + ip = ftrace_location(ip); + if (!ip) + goto out_unlock; + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5634,11 +5775,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) { ftrace_ops_init(ops); - return ftrace_set_addr(ops, ip, remove, reset, 1); + return ftrace_set_addr(ops, &ip, 1, remove, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** + * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses + * @ops - the ops to set the filter with + * @ips - the array of addresses to add to or remove from the filter. + * @cnt - the number of addresses in @ips + * @remove - non zero to remove ips from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ips array or any ip specified within is NULL , it fails to update filter. + */ +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ips, cnt, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); + +/** * ftrace_ops_set_global_filter - setup ops to use global filters * @ops - the ops which will use the global filters * @@ -5659,7 +5819,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); + return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); } /** @@ -6755,6 +6915,13 @@ void ftrace_module_enable(struct module *mod) !within_module_init(rec->ip, mod)) break; + /* Weak functions should still be ignored */ + if (!test_for_valid_rec(rec)) { + /* Clear all other flags. Should not be enabled anyway */ + rec->flags = FTRACE_FL_DISABLED; + continue; + } + cnt = 0; /* @@ -6791,11 +6958,16 @@ void ftrace_module_enable(struct module *mod) void ftrace_module_init(struct module *mod) { + int ret; + if (ftrace_disabled || !mod->num_ftrace_callsites) return; - ftrace_process_locs(mod, mod->ftrace_callsites, - mod->ftrace_callsites + mod->num_ftrace_callsites); + ret = ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); + if (ret) + pr_warn("ftrace: failed to allocate entries for module '%s' functions\n", + mod->name); } static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, @@ -7096,6 +7268,8 @@ void __init ftrace_free_init_mem(void) void *start = (void *)(&__init_begin); void *end = (void *)(&__init_end); + ftrace_boot_snapshot(); + ftrace_free_mem(NULL, start, end); } @@ -7126,15 +7300,19 @@ void __init ftrace_init(void) pr_info("ftrace: allocating %ld entries in %ld pages\n", count, count / ENTRIES_PER_PAGE + 1); - last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); + if (ret) { + pr_warn("ftrace: failed to allocate entries for functions\n"); + goto failed; + } pr_info("ftrace: allocated %ld pages with %ld groups\n", ftrace_number_of_pages, ftrace_number_of_groups); + last_ftrace_enabled = ftrace_enabled = 1; + set_ftrace_early_filters(); return; @@ -7193,9 +7371,6 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_all(int command) { } -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) - static void ftrace_update_trampoline(struct ftrace_ops *ops) { } @@ -7346,7 +7521,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *pid_list; @@ -7833,6 +8010,118 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct kallsyms_data { + unsigned long *addrs; + const char **syms; + size_t cnt; + size_t found; +}; + +static int kallsyms_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct kallsyms_data *args = data; + const char **sym; + int idx; + + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) + return 0; + + addr = ftrace_location(addr); + if (!addr) + return 0; + + args->addrs[idx] = addr; + args->found++; + return args->found == args->cnt ? 1 : 0; +} + +/** + * ftrace_lookup_symbols - Lookup addresses for array of symbols + * + * @sorted_syms: array of symbols pointers symbols to resolve, + * must be alphabetically sorted + * @cnt: number of symbols/addresses in @syms/@addrs arrays + * @addrs: array for storing resulting addresses + * + * This function looks up addresses for array of symbols provided in + * @syms array (must be alphabetically sorted) and stores them in + * @addrs array, which needs to be big enough to store at least @cnt + * addresses. + * + * This function returns 0 if all provided symbols are found, + * -ESRCH otherwise. + */ +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + struct kallsyms_data args; + int err; + + memset(addrs, 0, sizeof(*addrs) * cnt); + args.addrs = addrs; + args.syms = sorted_syms; + args.cnt = cnt; + args.found = 0; + err = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (err < 0) + return err; + return args.found == args.cnt ? 0 : -ESRCH; +} + +#ifdef CONFIG_SYSCTL + +#ifdef CONFIG_DYNAMIC_FTRACE +static void ftrace_startup_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } +} + +static void ftrace_shutdown_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } +} +#else +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + static bool is_permanent_ops_registered(void) { struct ftrace_ops *op; @@ -7845,7 +8134,7 @@ static bool is_permanent_ops_registered(void) return false; } -int +static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -7888,3 +8177,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static struct ctl_table ftrace_sysctls[] = { + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, + {} +}; + +static int __init ftrace_sysctl_init(void) +{ + register_sysctl_init("kernel", ftrace_sysctls); + return 0; +} +late_initcall(ftrace_sysctl_init); +#endif diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index a2ef1d18126a..95106d02b32d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1, /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test - * @pid: The pid to to see if set in the list. + * @pid: The pid to see if set in the list. * - * Tests if @pid is is set in the @pid_list. This is usually called + * Tests if @pid is set in the @pid_list. This is usually called * from the scheduler when a task is scheduled. Its pid is checked * if it should be traced or not. * diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c new file mode 100644 index 000000000000..c69d82273ce7 --- /dev/null +++ b/kernel/trace/rethook.c @@ -0,0 +1,326 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "rethook: " fmt + +#include <linux/bug.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/preempt.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +/* Return hook list (shadow stack by list) */ + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void rethook_flush_task(struct task_struct *tk) +{ + struct rethook_node *rhn; + struct llist_node *node; + + node = __llist_del_all(&tk->rethooks); + while (node) { + rhn = container_of(node, struct rethook_node, llist); + node = node->next; + preempt_disable(); + rethook_recycle(rhn); + preempt_enable(); + } +} + +static void rethook_free_rcu(struct rcu_head *head) +{ + struct rethook *rh = container_of(head, struct rethook, rcu); + struct rethook_node *rhn; + struct freelist_node *node; + int count = 1; + + node = rh->pool.head; + while (node) { + rhn = container_of(node, struct rethook_node, freelist); + node = node->next; + kfree(rhn); + count++; + } + + /* The rh->ref is the number of pooled node + 1 */ + if (refcount_sub_and_test(count, &rh->ref)) + kfree(rh); +} + +/** + * rethook_free() - Free struct rethook. + * @rh: the struct rethook to be freed. + * + * Free the rethook. Before calling this function, user must ensure the + * @rh::data is cleaned if needed (or, the handler can access it after + * calling this function.) This function will set the @rh to be freed + * after all rethook_node are freed (not soon). And the caller must + * not touch @rh after calling this. + */ +void rethook_free(struct rethook *rh) +{ + WRITE_ONCE(rh->handler, NULL); + + call_rcu(&rh->rcu, rethook_free_rcu); +} + +/** + * rethook_alloc() - Allocate struct rethook. + * @data: a data to pass the @handler when hooking the return. + * @handler: the return hook callback function. + * + * Allocate and initialize a new rethook with @data and @handler. + * Return NULL if memory allocation fails or @handler is NULL. + * Note that @handler == NULL means this rethook is going to be freed. + */ +struct rethook *rethook_alloc(void *data, rethook_handler_t handler) +{ + struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + + if (!rh || !handler) + return NULL; + + rh->data = data; + rh->handler = handler; + rh->pool.head = NULL; + refcount_set(&rh->ref, 1); + + return rh; +} + +/** + * rethook_add_node() - Add a new node to the rethook. + * @rh: the struct rethook. + * @node: the struct rethook_node to be added. + * + * Add @node to @rh. User must allocate @node (as a part of user's + * data structure.) The @node fields are initialized in this function. + */ +void rethook_add_node(struct rethook *rh, struct rethook_node *node) +{ + node->rethook = rh; + freelist_add(&node->freelist, &rh->pool); + refcount_inc(&rh->ref); +} + +static void free_rethook_node_rcu(struct rcu_head *head) +{ + struct rethook_node *node = container_of(head, struct rethook_node, rcu); + + if (refcount_dec_and_test(&node->rethook->ref)) + kfree(node->rethook); + kfree(node); +} + +/** + * rethook_recycle() - return the node to rethook. + * @node: The struct rethook_node to be returned. + * + * Return back the @node to @node::rethook. If the @node::rethook is already + * marked as freed, this will free the @node. + */ +void rethook_recycle(struct rethook_node *node) +{ + lockdep_assert_preemption_disabled(); + + if (likely(READ_ONCE(node->rethook->handler))) + freelist_add(&node->freelist, &node->rethook->pool); + else + call_rcu(&node->rcu, free_rethook_node_rcu); +} +NOKPROBE_SYMBOL(rethook_recycle); + +/** + * rethook_try_get() - get an unused rethook node. + * @rh: The struct rethook which pools the nodes. + * + * Get an unused rethook node from @rh. If the node pool is empty, this + * will return NULL. Caller must disable preemption. + */ +struct rethook_node *rethook_try_get(struct rethook *rh) +{ + rethook_handler_t handler = READ_ONCE(rh->handler); + struct freelist_node *fn; + + lockdep_assert_preemption_disabled(); + + /* Check whether @rh is going to be freed. */ + if (unlikely(!handler)) + return NULL; + + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + + fn = freelist_try_get(&rh->pool); + if (!fn) + return NULL; + + return container_of(fn, struct rethook_node, freelist); +} +NOKPROBE_SYMBOL(rethook_try_get); + +/** + * rethook_hook() - Hook the current function return. + * @node: The struct rethook node to hook the function return. + * @regs: The struct pt_regs for the function entry. + * @mcount: True if this is called from mcount(ftrace) context. + * + * Hook the current running function return. This must be called when the + * function entry (or at least @regs must be the registers of the function + * entry.) @mcount is used for identifying the context. If this is called + * from ftrace (mcount) callback, @mcount must be set true. If this is called + * from the real function entry (e.g. kprobes) @mcount must be set false. + * This is because the way to hook the function return depends on the context. + */ +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount) +{ + arch_rethook_prepare(node, regs, mcount); + __llist_add(&node->llist, ¤t->rethooks); +} +NOKPROBE_SYMBOL(rethook_hook); + +/* This assumes the 'tsk' is the current task or is not running. */ +static unsigned long __rethook_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) +{ + struct rethook_node *rh = NULL; + struct llist_node *node = *cur; + + if (!node) + node = tsk->rethooks.first; + else + node = node->next; + + while (node) { + rh = container_of(node, struct rethook_node, llist); + if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) { + *cur = node; + return rh->ret_addr; + } + node = node->next; + } + return 0; +} +NOKPROBE_SYMBOL(__rethook_find_ret_addr); + +/** + * rethook_find_ret_addr -- Find correct return address modified by rethook + * @tsk: Target task + * @frame: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a rethook on @tsk in unsigned + * long type. + * The @tsk must be 'current' or a task which is not running. @frame is a hint + * to get the currect return address - which is compared with the + * rethook::frame field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT NULL. + * + * Returns found address value or zero if not found. + */ +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur) +{ + struct rethook_node *rhn = NULL; + unsigned long ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + return 0; + + do { + ret = __rethook_find_ret_addr(tsk, cur); + if (!ret) + break; + rhn = container_of(*cur, struct rethook_node, llist); + } while (rhn->frame != frame); + + return ret; +} +NOKPROBE_SYMBOL(rethook_find_ret_addr); + +void __weak arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* + * Do nothing by default. If the architecture which uses a + * frame pointer to record real return address on the stack, + * it should fill this function to fixup the return address + * so that stacktrace works from the rethook handler. + */ +} + +/* This function will be called from each arch-defined trampoline. */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame) +{ + struct llist_node *first, *node = NULL; + unsigned long correct_ret_addr; + rethook_handler_t handler; + struct rethook_node *rhn; + + correct_ret_addr = __rethook_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n"); + BUG_ON(1); + } + + instruction_pointer_set(regs, correct_ret_addr); + + /* + * These loops must be protected from rethook_free_rcu() because those + * are accessing 'rhn->rethook'. + */ + preempt_disable(); + + /* + * Run the handler on the shadow stack. Do not unlink the list here because + * stackdump inside the handlers needs to decode it. + */ + first = current->rethooks.first; + while (first) { + rhn = container_of(first, struct rethook_node, llist); + if (WARN_ON_ONCE(rhn->frame != frame)) + break; + handler = READ_ONCE(rhn->rethook->handler); + if (handler) + handler(rhn, rhn->rethook->data, regs); + + if (first == node) + break; + first = first->next; + } + + /* Fixup registers for returning to correct address. */ + arch_rethook_fixup_return(regs, correct_ret_addr); + + /* Unlink used shadow stack */ + first = current->rethooks.first; + current->rethooks.first = node->next; + node->next = NULL; + + while (first) { + rhn = container_of(first, struct rethook_node, llist); + first = first->next; + rethook_recycle(rhn); + } + preempt_enable(); + + return correct_ret_addr; +} +NOKPROBE_SYMBOL(rethook_trampoline_handler); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05dfc7a12d3d..d59b6a328b7f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -29,6 +29,14 @@ #include <asm/local.h> +/* + * The "absolute" timestamp in the buffer is only 59 bits. + * If a clock has the 5 MSBs set, it needs to be saved and + * reinserted. + */ +#define TS_MSB (0xf8ULL << 56) +#define ABS_TS_MASK (~TS_MSB) + static void update_pages_handler(struct work_struct *work); /* @@ -468,6 +476,7 @@ struct rb_time_struct { local_t cnt; local_t top; local_t bottom; + local_t msb; }; #else #include <asm/local64.h> @@ -569,7 +578,6 @@ struct ring_buffer_iter { * For the ring buffer, 64 bit required operations for the time is * the following: * - * - Only need 59 bits (uses 60 to make it even). * - Reads may fail if it interrupted a modification of the time stamp. * It will succeed if it did not interrupt another write even if * the read itself is interrupted by a write. @@ -594,6 +602,7 @@ struct ring_buffer_iter { */ #define RB_TIME_SHIFT 30 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) +#define RB_TIME_MSB_SHIFT 60 static inline int rb_time_cnt(unsigned long val) { @@ -613,7 +622,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom) static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) { - unsigned long top, bottom; + unsigned long top, bottom, msb; unsigned long c; /* @@ -625,6 +634,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); + msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); @@ -633,7 +643,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) if (*cnt != rb_time_cnt(bottom)) return false; - *ret = rb_time_val(top, bottom); + /* The shift to msb will lose its cnt bits */ + *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); return true; } @@ -649,10 +660,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); } -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, + unsigned long *msb) { *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); + *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); } static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) @@ -663,15 +676,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long static void rb_time_set(rb_time_t *t, u64 val) { - unsigned long cnt, top, bottom; + unsigned long cnt, top, bottom, msb; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); /* Writes always succeed with a valid number even if it gets interrupted. */ do { cnt = local_inc_return(&t->cnt); rb_time_val_set(&t->top, top, cnt); rb_time_val_set(&t->bottom, bottom, cnt); + rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); } while (cnt != local_read(&t->cnt)); } @@ -686,8 +700,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - unsigned long cnt, top, bottom; - unsigned long cnt2, top2, bottom2; + unsigned long cnt, top, bottom, msb; + unsigned long cnt2, top2, bottom2, msb2; u64 val; /* The cmpxchg always fails if it interrupted an update */ @@ -703,16 +717,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) cnt2 = cnt + 1; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); top = rb_time_val_cnt(top, cnt); bottom = rb_time_val_cnt(bottom, cnt); - rb_time_split(set, &top2, &bottom2); + rb_time_split(set, &top2, &bottom2, &msb2); top2 = rb_time_val_cnt(top2, cnt2); bottom2 = rb_time_val_cnt(bottom2, cnt2); if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) return false; + if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) + return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) @@ -783,6 +799,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, } #endif +/* + * The absolute time stamp drops the 5 MSBs and some clocks may + * require them. The rb_fix_abs_ts() will take a previous full + * time stamp, and add the 5 MSB of that time stamp on to the + * saved absolute time stamp. Then they are compared in case of + * the unlikely event that the latest time stamp incremented + * the 5 MSB. + */ +static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) +{ + if (save_ts & TS_MSB) { + abs |= save_ts & TS_MSB; + /* Check for overflow */ + if (unlikely(abs < save_ts)) + abs += 1ULL << 59; + } + return abs; +} static inline u64 rb_time_stamp(struct trace_buffer *buffer); @@ -811,8 +845,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, u64 ts; /* If the event includes an absolute time, then just use that */ - if (event->type_len == RINGBUF_TYPE_TIME_STAMP) - return rb_event_time_stamp(event); + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + ts = rb_event_time_stamp(event); + return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); + } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); @@ -2754,8 +2790,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { + /* + * Some timers can use more than 59 bits, and when a timestamp + * is added to the buffer, it will lose those bits. + */ + if (abs && (info->ts & TS_MSB)) { + info->delta &= ABS_TS_MASK; + /* did the clock go backwards */ - if (info->before == info->after && info->before > info->ts) { + } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; @@ -3304,7 +3347,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3380,7 +3423,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); break; case RINGBUF_TYPE_PADDING: @@ -4367,6 +4410,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; @@ -4397,6 +4441,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; @@ -4650,6 +4695,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4741,6 +4787,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -6011,10 +6058,10 @@ static __init int test_ringbuffer(void) pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); - if (total_lost) + if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); - if (!total_lost) { + } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eb44418574f9..a8cfac0611bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static bool allocate_snapshot; +static bool snapshot_at_boot; static int __init set_cmdline_ftrace(char *str) { @@ -230,6 +231,15 @@ static int __init boot_alloc_snapshot(char *str) __setup("alloc_snapshot", boot_alloc_snapshot); +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); + + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) @@ -711,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); - if (ret < 0 || !trace_parser_loaded(&parser)) + if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; + if (!trace_parser_loaded(&parser)) + break; + ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -743,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); - read = ret; pid_list = NULL; } @@ -1164,7 +1176,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** - * tracing_snapshot_cond_data - get the user data associated with a snapshot + * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using @@ -1532,6 +1544,7 @@ static struct { { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; @@ -2825,7 +2838,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) @@ -2853,14 +2866,14 @@ static void output_printk(struct trace_event_buffer *fbuffer) event = &fbuffer->trace_file->event_call->event; - spin_lock_irqsave(&tracepoint_iter_lock, flags); + raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); + raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(struct ctl_table *table, int write, @@ -3663,12 +3676,17 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str) +static bool trace_safe_str(struct trace_iterator *iter, const char *str, + bool star, int len) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; + /* Ignore strings with no length */ + if (star && !len) + return true; + /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3854,7 +3872,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!trace_safe_str(iter, str), + if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", fmt, show_buffer(&iter->seq))) { int ret; @@ -4234,7 +4252,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; + static const char space[] = " "; int prec = tgid ? 12 : 2; print_event_info(buf, m); @@ -4258,9 +4276,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct tracer *type = iter->trace; unsigned long entries; unsigned long total; - const char *name = "preemption"; - - name = type->name; + const char *name = type->name; get_total_entries(buf, &total, &entries); @@ -4274,17 +4290,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#elif defined(CONFIG_PREEMPT_RT) - "preempt_rt", -#else + preempt_model_none() ? "server" : + preempt_model_voluntary() ? "desktop" : + preempt_model_full() ? "preempt" : + preempt_model_rt() ? "preempt_rt" : "unknown", -#endif /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -5460,7 +5470,7 @@ static const char readme_msg[] = " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" - " trace_clock\t\t-change the clock used to order events\n" + " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" @@ -5469,7 +5479,7 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif - "\n timestamp_mode\t-view the mode used to timestamp events\n" + "\n timestamp_mode\t- view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" @@ -6317,12 +6327,18 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } +static bool tracer_options_updated; + static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ if (!tr->dir) return; + /* Only create trace option files after update_tracer_options finish */ + if (!tracer_options_updated) + return; + create_trace_option_files(tr, t); } @@ -6408,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) synchronize_rcu(); free_snapshot(tr); } -#endif -#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) @@ -6439,7 +6453,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; - int i; + char *name; size_t ret; int err; @@ -6453,11 +6467,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + name = strim(buf); - err = tracing_set_tracer(tr, buf); + err = tracing_set_tracer(tr, name); if (err) return err; @@ -7725,7 +7737,7 @@ const struct file_operations trace_min_max_fops = { struct err_info { const char **errs; /* ptr to loc-specific array of err strings */ u8 type; /* index into errs -> specific err string */ - u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u16 pos; /* caret position */ u64 ts; }; @@ -7733,26 +7745,52 @@ struct tracing_log_err { struct list_head list; struct err_info info; char loc[TRACING_LOG_LOC_MAX]; /* err location */ - char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ + char *cmd; /* what caused err */ }; static DEFINE_MUTEX(tracing_err_log_lock); -static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +static struct tracing_log_err *alloc_tracing_log_err(int len) +{ + struct tracing_log_err *err; + + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + return ERR_PTR(-ENOMEM); + + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) { + kfree(err); + return ERR_PTR(-ENOMEM); + } + + return err; +} + +static void free_tracing_log_err(struct tracing_log_err *err) +{ + kfree(err->cmd); + kfree(err); +} + +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, + int len) { struct tracing_log_err *err; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { - err = kzalloc(sizeof(*err), GFP_KERNEL); - if (!err) - err = ERR_PTR(-ENOMEM); - else + err = alloc_tracing_log_err(len); + if (PTR_ERR(err) != -ENOMEM) tr->n_err_log_entries++; return err; } err = list_first_entry(&tr->err_log, struct tracing_log_err, list); + kfree(err->cmd); + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) + return ERR_PTR(-ENOMEM); list_del(&err->list); return err; @@ -7813,22 +7851,25 @@ unsigned int err_pos(char *cmd, const char *str) */ void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, - const char **errs, u8 type, u8 pos) + const char **errs, u8 type, u16 pos) { struct tracing_log_err *err; + int len = 0; if (!tr) tr = &global_trace; + len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; + mutex_lock(&tracing_err_log_lock); - err = get_tracing_log_err(tr); + err = get_tracing_log_err(tr, len); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); - snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); err->info.errs = errs; err->info.type = type; @@ -7846,7 +7887,7 @@ static void clear_tracing_err_log(struct trace_array *tr) mutex_lock(&tracing_err_log_lock); list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); - kfree(err); + free_tracing_log_err(err); } tr->n_err_log_entries = 0; @@ -7874,9 +7915,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v) mutex_unlock(&tracing_err_log_lock); } -static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) { - u8 i; + u16 i; for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) seq_putc(m, ' '); @@ -9132,6 +9173,7 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { mutex_lock(&trace_types_lock); + tracer_options_updated = true; __update_tracer_options(tr); mutex_unlock(&trace_types_lock); } @@ -9564,6 +9606,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static struct workqueue_struct *eval_map_wq __initdata; static struct work_struct eval_map_work __initdata; +static struct work_struct tracerfs_init_work __initdata; static void __init eval_map_work_func(struct work_struct *work) { @@ -9589,6 +9632,8 @@ static int __init trace_eval_init(void) return 0; } +subsys_initcall(trace_eval_init); + static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ @@ -9671,15 +9716,8 @@ static struct notifier_block trace_module_nb = { }; #endif /* CONFIG_MODULES */ -static __init int tracer_init_tracefs(void) +static __init void tracer_init_tracefs_work_func(struct work_struct *work) { - int ret; - - trace_access_lock_init(); - - ret = tracing_init_dentry(); - if (ret) - return 0; event_trace_init(); @@ -9701,8 +9739,6 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); - trace_eval_init(); - trace_create_eval_file(NULL); #ifdef CONFIG_MODULES @@ -9717,6 +9753,24 @@ static __init int tracer_init_tracefs(void) create_trace_instances(NULL); update_tracer_options(&global_trace); +} + +static __init int tracer_init_tracefs(void) +{ + int ret; + + trace_access_lock_init(); + + ret = tracing_init_dentry(); + if (ret) + return 0; + + if (eval_map_wq) { + INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); + queue_work(eval_map_wq, &tracerfs_init_work); + } else { + tracer_init_tracefs_work_func(NULL); + } return 0; } @@ -10122,6 +10176,14 @@ out: return ret; } +void __init ftrace_boot_snapshot(void) +{ + if (snapshot_at_boot) { + tracing_snapshot(); + internal_trace_puts("** Boot snapshot taken **\n"); + } +} + void __init early_trace_init(void) { if (tracepoint_printk) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5b09c31e077..ff816fb41e48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1573,13 +1573,12 @@ struct enable_trigger_data { }; extern int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, - struct event_trigger_data *data); -extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); extern int event_enable_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file); @@ -1587,8 +1586,7 @@ extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); -extern int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); @@ -1629,10 +1627,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops, extern int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *trigger, - struct event_trigger_data *trigger_data, - int *n_registered); + struct event_trigger_data *trigger_data); +extern void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data); /** * struct event_trigger_ops - callbacks for trace event triggers @@ -1686,12 +1685,9 @@ struct event_trigger_ops { struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_ops *ops, - struct event_trigger_data *data); - void (*free)(struct event_trigger_ops *ops, - struct event_trigger_data *data); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); int (*print)(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data); }; @@ -1877,7 +1873,7 @@ extern ssize_t trace_parse_run_command(struct file *file, extern unsigned int err_pos(char *cmd, const char *str); extern void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, - const char **errs, u8 type, u8 pos); + const char **errs, u8 type, u16 pos); /* * Normal trace_printk() and friends allocates special buffers diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0580287d7a0d..778200dd8ede 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, { struct xbc_node *node; const char *p, *handler; - int ret; + int ret = 0; handler = xbc_node_get_data(hnode); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e34e8182ee4b..076b447a1b88 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -255,19 +255,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *entry; int ret; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, - NULL, &dynamic_events_ops); - - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'dynamic_events' entry\n"); + trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, + NULL, &dynamic_events_ops); return 0; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 541aa13581b9..7d4478525c66 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -511,20 +511,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) * functions are just stubs to fulfill what is needed to use the trigger * infrastructure. */ -static int eprobe_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int eprobe_trigger_init(struct event_trigger_data *data) { return 0; } -static void eprobe_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void eprobe_trigger_free(struct event_trigger_data *data) { } static int eprobe_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { /* Do not print eprobe event triggers */ @@ -549,7 +546,8 @@ static struct event_trigger_ops eprobe_trigger_ops = { static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { return -1; } @@ -650,7 +648,7 @@ static struct trace_event_functions eprobe_funcs = { static int disable_eprobe(struct trace_eprobe *ep, struct trace_array *tr) { - struct event_trigger_data *trigger; + struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; struct eprobe_data *edata; @@ -658,14 +656,16 @@ static int disable_eprobe(struct trace_eprobe *ep, if (!file) return -ENOENT; - list_for_each_entry(trigger, &file->triggers, list) { - if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + list_for_each_entry(iter, &file->triggers, list) { + if (!(iter->flags & EVENT_TRIGGER_FL_PROBE)) continue; - edata = trigger->private_data; - if (edata->ep == ep) + edata = iter->private_data; + if (edata->ep == ep) { + trigger = iter; break; + } } - if (list_entry_is_head(trigger, &file->triggers, list)) + if (!trigger) return -ENODEV; list_del_rcu(&trigger->list); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3147614c1812..181f08186d32 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); static bool eventdir_initialized; +static LIST_HEAD(module_strings); + +struct module_string { + struct list_head next; + struct module *module; + char *str; +}; + #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) static struct kmem_cache *field_cachep; @@ -399,7 +407,14 @@ static void test_event_printk(struct trace_event_call *call) a = strchr(fmt + i, '&'); if ((a && (a < r)) || test_field(r, call)) dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); } + next_arg: i--; arg++; @@ -759,7 +774,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -783,7 +800,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -1705,9 +1724,9 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { + struct trace_subsystem_dir *dir = NULL, *iter_dir; + struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; - struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ - struct trace_array *tr; int ret; if (tracing_is_disabled()) @@ -1716,10 +1735,12 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - list_for_each_entry(dir, &tr->systems, list) { - if (dir == inode->i_private) { + list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { + list_for_each_entry(iter_dir, &iter_tr->systems, list) { + if (iter_dir == inode->i_private) { /* Don't open systems with no events */ + tr = iter_tr; + dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; @@ -1735,9 +1756,6 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (!system) return -ENODEV; - /* Some versions of gcc think dir can be uninitialized here */ - WARN_ON(!dir); - /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); @@ -2262,8 +2280,8 @@ static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { + struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ @@ -2277,13 +2295,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } /* Now see if the system itself exists. */ - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + system = NULL; + list_for_each_entry(iter, &event_subsystems, list) { + if (strcmp(iter->name, name) == 0) { + system = iter; break; + } } - /* Reset system variable when not found */ - if (&system->list == &event_subsystems) - system = NULL; dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) @@ -2633,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call, } } +static void add_str_to_module(struct module *module, char *str) +{ + struct module_string *modstr; + + modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); + + /* + * If we failed to allocate memory here, then we'll just + * let the str memory leak when the module is removed. + * If this fails to allocate, there's worse problems than + * a leaked string on module removal. + */ + if (WARN_ON_ONCE(!modstr)) + return; + + modstr->module = module; + modstr->str = str; + + list_add(&modstr->next, &module_strings); +} + +static void update_event_fields(struct trace_event_call *call, + struct trace_eval_map *map) +{ + struct ftrace_event_field *field; + struct list_head *head; + char *ptr; + char *str; + int len = strlen(map->eval_string); + + /* Dynamic events should never have field maps */ + if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + return; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + ptr = strchr(field->type, '['); + if (!ptr) + continue; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + continue; + + if (strncmp(map->eval_string, ptr, len) != 0) + continue; + + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } + + /* + * If the event is part of a module, then we need to free the string + * when the module is removed. Otherwise, it will stay allocated + * until a reboot. + */ + if (call->module) + add_str_to_module(call->module, str); + + field->type = str; + } +} + void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; @@ -2668,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) first = false; } update_event_printk(call, map[i]); + update_event_fields(call, map[i]); } } } @@ -2758,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call) mutex_unlock(&trace_types_lock); return ret; } +EXPORT_SYMBOL_GPL(trace_add_event_call); /* * Must be called under locking of trace_types_lock, event_mutex and @@ -2819,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call) return ret; } +EXPORT_SYMBOL_GPL(trace_remove_event_call); #define for_each_event(event, start, end) \ for (event = start; \ @@ -2853,6 +2944,7 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; + struct module_string *modstr, *m; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { @@ -2861,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } + /* Check for any strings allocade for this module */ + list_for_each_entry_safe(modstr, m, &module_strings, next) { + if (modstr->module != mod) + continue; + list_del(&modstr->next); + kfree(modstr->str); + kfree(modstr); + } up_write(&trace_event_sem); /* @@ -3446,12 +3546,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) { - pr_warn("Could not create tracefs 'set_event' entry\n"); + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) return -ENOMEM; - } d_events = tracefs_create_dir("events", parent); if (!d_events) { @@ -3466,16 +3564,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_pid' entry\n"); + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); - entry = tracefs_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ trace_create_file("header_page", TRACE_MODE_READ, d_events, @@ -3690,17 +3784,14 @@ static __init int event_trace_init_fields(void) __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *entry; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", TRACE_MODE_READ, - NULL, tr, &ftrace_avail_fops); - if (!entry) - pr_warn("Could not create tracefs 'available_events' entry\n"); + trace_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b458a9afa2c0..4b1057ab9d96 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1816,7 +1816,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * create_filter - create a filter for a trace_event_call * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for - * @filter_str: filter string + * @filter_string: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) * Must be a pointer that references a NULL pointer. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index dc7f733b4cb3..48e82e141d54 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_cmd[MAX_FILTER_STR_VAL]; +#define HIST_PREFIX "hist:" + +static char *last_cmd; static char last_cmd_loc[MAX_FILTER_STR_VAL]; static int errpos(char *str) { + if (!str || !last_cmd) + return 0; + return err_pos(last_cmd, str); } @@ -739,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; + int len; if (!str) return; - strcpy(last_cmd, "hist:"); - strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); + /* sizeof() contains the nul byte */ + len = sizeof(HIST_PREFIX) + strlen(str); + kfree(last_cmd); + last_cmd = kzalloc(len, GFP_KERNEL); + if (!last_cmd) + return; + + strcpy(last_cmd, HIST_PREFIX); + /* Again, sizeof() contains the nul byte */ + len -= sizeof(HIST_PREFIX); + strncat(last_cmd, str, len); if (file) { call = file->event_call; @@ -757,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str) } if (system) - snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name); } -static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) +static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos) { + if (!last_cmd) + return; + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, err_type, err_pos); } static void hist_err_clear(void) { - last_cmd[0] = '\0'; + if (last_cmd) + last_cmd[0] = '\0'; last_cmd_loc[0] = '\0'; } @@ -2074,8 +2093,11 @@ static int init_var_ref(struct hist_field *ref_field, return err; free: kfree(ref_field->system); + ref_field->system = NULL; kfree(ref_field->event_name); + ref_field->event_name = NULL; kfree(ref_field->name); + ref_field->name = NULL; goto out; } @@ -2766,7 +2788,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, static struct event_command trigger_hist_cmd; static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -4142,7 +4165,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } -static const char *no_comm = "(no comm)"; +static const char no_comm[] = "(no comm)"; static u64 hist_field_execname(struct hist_field *hist_field, struct tracing_map_elt *elt, @@ -5233,7 +5256,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5465,7 +5488,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5602,7 +5625,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) } static int event_hist_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5610,7 +5632,7 @@ static int event_hist_trigger_print(struct seq_file *m, bool have_var = false; unsigned int i; - seq_puts(m, "hist:"); + seq_puts(m, HIST_PREFIX); if (data->name) seq_printf(m, "%s:", data->name); @@ -5710,8 +5732,7 @@ static int event_hist_trigger_print(struct seq_file *m, return 0; } -static int event_hist_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5739,8 +5760,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) } } -static void event_hist_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_free(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5769,25 +5789,23 @@ static struct event_trigger_ops event_hist_trigger_ops = { .free = event_hist_trigger_free, }; -static int event_hist_trigger_named_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_named_init(struct event_trigger_data *data) { data->ref++; save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(ops, data->named_data); + event_hist_trigger_init(data->named_data); return 0; } -static void event_hist_trigger_named_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_named_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; - event_hist_trigger_free(ops, data->named_data); + event_hist_trigger_free(data->named_data); data->ref--; if (!data->ref) { @@ -5914,6 +5932,48 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } +static bool existing_hist_update_only(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool updated = false; + + if (!hist_data->attrs->pause && !hist_data->attrs->cont && + !hist_data->attrs->clear) + goto out; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) + goto out; + } + } + + if (hist_data->attrs->name && !named_data) + goto out; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + updated = true; + goto out; + } + } + out: + return updated; +} + static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) @@ -5942,19 +6002,11 @@ static int hist_register_trigger(char *glob, list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) - continue; - if (hist_data->attrs->pause) - test->paused = true; - else if (hist_data->attrs->cont) - test->paused = false; - else if (hist_data->attrs->clear) - hist_clear(test); - else { + if (hist_trigger_match(data, test, named_data, false)) { hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; + goto out; } - goto out; } } new: @@ -5974,7 +6026,7 @@ static int hist_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -5993,8 +6045,6 @@ static int hist_register_trigger(char *glob, if (named_data) destroy_hist_data(hist_data); - - ret++; out: return ret; } @@ -6070,20 +6120,19 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { + struct event_trigger_data *test = NULL, *iter, *named_data = NULL; struct hist_trigger_data *hist_data = data->private_data; - struct event_trigger_data *test, *named_data = NULL; - bool unregistered = false; lockdep_assert_held(&event_mutex); if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, iter, named_data, false)) continue; - unregistered = true; + test = iter; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -6091,11 +6140,11 @@ static void hist_unregister_trigger(char *glob, } } - if (unregistered && test->ops->free) - test->ops->free(test->ops, test); + if (test && test->ops->free) + test->ops->free(test); if (hist_data->enable_timestamps) { - if (!hist_data->remove || unregistered) + if (!hist_data->remove || test) tracing_set_filter_buffering(file->tr, false); } } @@ -6145,57 +6194,57 @@ static void hist_unreg_all(struct trace_event_file *file) if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; struct hist_trigger_attrs *attrs; - struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; + char *param, *filter, *p, *start; struct synth_event *se; const char *se_name; - bool remove = false; - char *trigger, *p, *start; + bool remove; int ret = 0; lockdep_assert_held(&event_mutex); - WARN_ON(!glob); + if (WARN_ON(!glob)) + return -EINVAL; - if (strlen(glob)) { + if (glob[0]) { hist_err_clear(); - last_cmd_set(file, param); + last_cmd_set(file, param_and_filter); } - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - if (glob[0] == '!') - remove = true; + if (event_trigger_empty_param(param_and_filter)) + return -EINVAL; /* * separate the trigger from the filter (k:v [if filter]) * allowing for whitespace in the trigger */ - p = trigger = param; + p = param = param_and_filter; do { p = strstr(p, "if"); if (!p) break; - if (p == param) + if (p == param_and_filter) return -EINVAL; if (*(p - 1) != ' ' && *(p - 1) != '\t') { p++; continue; } - if (p >= param + strlen(param) - (sizeof("if") - 1) - 1) + if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1) return -EINVAL; if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { p++; @@ -6205,24 +6254,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, } while (1); if (!p) - param = NULL; + filter = NULL; else { *(p - 1) = '\0'; - param = strstrip(p); - trigger = strstrip(trigger); + filter = strstrip(p); + param = strstrip(param); } /* * To simplify arithmetic expression parsing, replace occurrences of * '.sym-offset' modifier with '.symXoffset' */ - start = strstr(trigger, ".sym-offset"); + start = strstr(param, ".sym-offset"); while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); } - attrs = parse_hist_trigger_attrs(file->tr, trigger); + attrs = parse_hist_trigger_attrs(file->tr, param); if (IS_ERR(attrs)) return PTR_ERR(attrs); @@ -6235,29 +6284,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; } - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); - - trigger_data->private_data = hist_data; - - /* if param is non-empty, it's supposed to be a filter */ - if (param && cmd_ops->set_filter) { - ret = cmd_ops->set_filter(param, trigger_data, file); - if (ret < 0) - goto out_free; - } + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); + if (ret < 0) + goto out_free; if (remove) { if (!have_hist_trigger_match(trigger_data, file)) @@ -6268,7 +6303,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6277,17 +6312,11 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of triggers registered, - * but if it didn't register any it returns zero. Consider no - * triggers registered a failure too. - */ - if (!ret) { - if (!(attrs->pause || attrs->cont || attrs->clear)) - ret = -ENOENT; + if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - } else if (ret < 0) + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) goto out_free; if (get_named_trigger_data(trigger_data)) @@ -6312,18 +6341,15 @@ enable: se = find_synth_event(se_name); if (se) se->ref++; - /* Just return zero, not the number of registered triggers */ - ret = 0; out: if (ret == 0) hist_err_clear(); return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); remove_hist_vars(hist_data); @@ -6444,7 +6470,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 154db74dadbc..5e8c07aef071 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -42,10 +42,13 @@ enum { ERRORS }; static const char *err_text[] = { ERRORS }; -static char last_cmd[MAX_FILTER_STR_VAL]; +static char *last_cmd; static int errpos(const char *str) { + if (!str || !last_cmd) + return 0; + return err_pos(last_cmd, str); } @@ -54,11 +57,16 @@ static void last_cmd_set(const char *str) if (!str) return; - strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + kfree(last_cmd); + + last_cmd = kstrdup(str, GFP_KERNEL); } -static void synth_err(u8 err_type, u8 err_pos) +static void synth_err(u8 err_type, u16 err_pos) { + if (!last_cmd) + return; + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 7eb9d04f1c2e..cb866c3141af 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -188,7 +188,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); return 0; } @@ -432,7 +432,6 @@ event_trigger_print(const char *name, struct seq_file *m, /** * event_trigger_init - Generic event_trigger_ops @init implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -442,8 +441,7 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_data *data) { data->ref++; return 0; @@ -451,7 +449,6 @@ int event_trigger_init(struct event_trigger_ops *ops, /** * event_trigger_free - Generic event_trigger_ops @free implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. @@ -460,8 +457,7 @@ int event_trigger_init(struct event_trigger_ops *ops, * implementations. */ static void -event_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +event_trigger_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; @@ -515,7 +511,7 @@ clear_event_triggers(struct trace_array *tr) trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); if (data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } } } @@ -581,19 +577,18 @@ static int register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; @@ -614,14 +609,13 @@ static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { - struct event_trigger_data *data; - bool unregistered = false; + struct event_trigger_data *data = NULL, *iter; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { - unregistered = true; + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -629,8 +623,8 @@ static void unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + if (data && data->ops->free) + data->ops->free(data); } /* @@ -744,15 +738,15 @@ bool event_trigger_empty_param(const char *param) /** * event_trigger_separate_filter - separate an event trigger from a filter - * @param: The param string containing trigger and possibly filter - * @trigger: outparam, will be filled with a pointer to the trigger + * @param_and_filter: String containing trigger and possibly filter + * @param: outparam, will be filled with a pointer to the trigger * @filter: outparam, will be filled with a pointer to the filter * @param_required: Specifies whether or not the param string is required * * Given a param string of the form '[trigger] [if filter]', this * function separates the filter from the trigger and returns the - * trigger in *trigger and the filter in *filter. Either the *trigger - * or the *filter may be set to NULL by this function - if not set to + * trigger in @param and the filter in @filter. Either the @param + * or the @filter may be set to NULL by this function - if not set to * NULL, they will contain strings corresponding to the trigger and * filter. * @@ -927,48 +921,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops, * @cmd_ops: The event_command operations for the trigger * @file: The event file for the trigger's event * @glob: The trigger command string, with optional remove(!) operator - * @cmd: The cmd string - * @param: The param string * @trigger_data: The trigger_data for the trigger - * @n_registered: optional outparam, the number of triggers registered * * Register an event trigger. The @cmd_ops are used to call the - * cmd_ops->reg() function which actually does the registration. The - * cmd_ops->reg() function returns the number of triggers registered, - * which is assigned to n_registered, if n_registered is non-NULL. + * cmd_ops->reg() function which actually does the registration. * * Return: 0 on success, errno otherwise */ int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *param, - struct event_trigger_data *trigger_data, - int *n_registered) + struct event_trigger_data *trigger_data) { - int ret; - - if (n_registered) - *n_registered = 0; - - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) { - if (n_registered) - *n_registered = ret; - /* Just return zero, not the number of enabled functions */ - ret = 0; - } + return cmd_ops->reg(glob, trigger_data, file); +} - return ret; +/** + * event_trigger_unregister - unregister an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @trigger_data: The trigger_data for the trigger + * + * Unregister an event trigger. The @cmd_ops are used to call the + * cmd_ops->unreg() function which actually does the unregistration. + */ +void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data) +{ + cmd_ops->unreg(glob, trigger_data, file); } /* @@ -981,7 +964,7 @@ int event_trigger_register(struct event_command *cmd_ops, * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger * @cmd: The cmd portion of the string used to register the trigger - * @param: The params portion of the string used to register the trigger + * @param_and_filter: The param and filter portion of the string used to register the trigger * * Common implementation for event command parsing and trigger * instantiation. @@ -994,94 +977,53 @@ int event_trigger_register(struct event_command *cmd_ops, static int event_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; - char *trigger = NULL; - char *number; + char *param, *filter; + bool remove; int ret; - /* separate the trigger from the filter (t:n [if filter]) */ - if (param && isdigit(param[0])) { - trigger = strsep(¶m, " \t"); - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - } + remove = event_trigger_check_remove(glob); - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, false); + if (ret) + return ret; ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); if (!trigger_data) goto out; - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - trigger_data->private_data = file; - INIT_LIST_HEAD(&trigger_data->list); - INIT_LIST_HEAD(&trigger_data->named_list); - - if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + if (remove) { + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); ret = 0; goto out; } - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ - event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) - ret = 0; + event_trigger_init(trigger_data); + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; /* Down the counter of trigger_data or free it if not used anymore */ - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); out: return ret; out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); kfree(trigger_data); goto out; } @@ -1401,16 +1343,14 @@ traceoff_count_trigger(struct event_trigger_data *data, } static int -traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceon", m, (void *)data->count, data->filter_str); } static int -traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceoff", m, (void *)data->count, data->filter_str); @@ -1521,8 +1461,7 @@ register_snapshot_trigger(char *glob, } static int -snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("snapshot", m, (void *)data->count, data->filter_str); @@ -1617,8 +1556,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, } static int -stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("stacktrace", m, (void *)data->count, data->filter_str); @@ -1708,7 +1646,6 @@ event_enable_count_trigger(struct event_trigger_data *data, } int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1733,8 +1670,7 @@ int event_enable_trigger_print(struct seq_file *m, return 0; } -void event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1781,39 +1717,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; struct trace_array *tr = file->tr; + char *param, *filter; + bool enable, remove; const char *system; const char *event; bool hist = false; - char *trigger; - char *number; - bool enable; int ret; - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - /* separate the trigger from the filter (s:e:n [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) + if (event_trigger_empty_param(param_and_filter)) return -EINVAL; - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - system = strsep(&trigger, ":"); - if (!trigger) + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, true); + if (ret) + return ret; + + system = strsep(¶m, ":"); + if (!param) return -EINVAL; - event = strsep(&trigger, ":"); + event = strsep(¶m, ":"); ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); @@ -1829,32 +1759,24 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, #else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; #endif - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) - goto out; enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); - if (!enable_data) { - kfree(trigger_data); + if (!enable_data) goto out; - } - - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data->private_data = enable_data; - if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + if (!trigger_data) { + kfree(enable_data); + goto out; + } + + if (remove) { + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1862,35 +1784,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, } /* Up the trigger_data count to make sure nothing frees it on failure */ - event_trigger_init(trigger_ops, trigger_data); - - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } + event_trigger_init(trigger_data); - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { @@ -1901,32 +1804,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) goto out_disable; - /* Just return zero, not the number of enabled functions */ - ret = 0; - event_trigger_free(trigger_ops, trigger_data); + + event_trigger_free(trigger_data); out: return ret; - out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: trace_event_put_ref(event_enable_file->event_call); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); - event_trigger_free(trigger_ops, trigger_data); + event_trigger_reset_filter(cmd_ops, trigger_data); + event_trigger_free(trigger_data); kfree(enable_data); + goto out; } @@ -1953,19 +1847,18 @@ int event_enable_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; @@ -1976,19 +1869,18 @@ void event_enable_unregister_trigger(char *glob, struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; + struct event_trigger_data *data = NULL, *iter; struct enable_trigger_data *enable_data; - struct event_trigger_data *data; - bool unregistered = false; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - enable_data = data->private_data; + list_for_each_entry(iter, &file->triggers, list) { + enable_data = iter->private_data; if (enable_data && - (data->cmd_ops->trigger_type == + (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { - unregistered = true; + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -1996,8 +1888,8 @@ void event_enable_unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + if (data && data->ops->free) + data->ops->free(data); } static struct event_trigger_ops * diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c new file mode 100644 index 000000000000..706e1686b5eb --- /dev/null +++ b/kernel/trace/trace_events_user.c @@ -0,0 +1,1628 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave <beaub@linux.microsoft.com> + */ + +#include <linux/bitmap.h> +#include <linux/cdev.h> +#include <linux/hashtable.h> +#include <linux/list.h> +#include <linux/io.h> +#include <linux/uio.h> +#include <linux/ioctl.h> +#include <linux/jhash.h> +#include <linux/trace_events.h> +#include <linux/tracefs.h> +#include <linux/types.h> +#include <linux/uaccess.h> +/* Reminder to move to uapi when everything works */ +#ifdef CONFIG_COMPILE_TEST +#include <linux/user_events.h> +#else +#include <uapi/linux/user_events.h> +#endif +#include "trace.h" +#include "trace_dynevent.h" + +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1) + +#define FIELD_DEPTH_TYPE 0 +#define FIELD_DEPTH_NAME 1 +#define FIELD_DEPTH_SIZE 2 + +/* + * Limits how many trace_event calls user processes can create: + * Must be a power of two of PAGE_SIZE. + */ +#define MAX_PAGE_ORDER 0 +#define MAX_PAGES (1 << MAX_PAGE_ORDER) +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) + +/* Limit how long of an event name plus args within the subsystem. */ +#define MAX_EVENT_DESC 512 +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define MAX_FIELD_ARRAY_SIZE 1024 +#define MAX_FIELD_ARG_NAME 256 + +static char *register_page_data; + +static DEFINE_MUTEX(reg_mutex); +static DEFINE_HASHTABLE(register_table, 4); +static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); + +/* + * Stores per-event properties, as users register events + * within a file a user_event might be created if it does not + * already exist. These are globally used and their lifetime + * is tied to the refcnt member. These cannot go away until the + * refcnt reaches zero. + */ +struct user_event { + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + struct list_head validators; + atomic_t refcnt; + int index; + int flags; + int min_size; +}; + +/* + * Stores per-file events references, as users register events + * within a file this structure is modified and freed via RCU. + * The lifetime of this struct is tied to the lifetime of the file. + * These are not shared and only accessible by the file that created it. + */ +struct user_event_refs { + struct rcu_head rcu; + int count; + struct user_event *events[]; +}; + +#define VALIDATOR_ENSURE_NULL (1 << 0) +#define VALIDATOR_REL (1 << 1) + +struct user_event_validator { + struct list_head link; + int offset; + int flags; +}; + +typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted); + +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser); + +static u32 user_event_key(char *name) +{ + return jhash(name, strlen(name), 0); +} + +static __always_inline __must_check +size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) +{ + size_t ret; + + pagefault_disable(); + + ret = copy_from_iter_nocache(addr, bytes, i); + + pagefault_enable(); + + return ret; +} + +static struct list_head *user_event_get_fields(struct trace_event_call *call) +{ + struct user_event *user = (struct user_event *)call->data; + + return &user->fields; +} + +/* + * Parses a register command for user_events + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]] + * + * Example event named 'test' with a 20 char 'msg' field with an unsigned int + * 'id' field after: + * test char[20] msg;unsigned int id + * + * NOTE: Offsets are from the user data perspective, they are not from the + * trace_entry/buffer perspective. We automatically add the common properties + * sizes to the offset for the user. + * + * Upon success user_event has its ref count increased by 1. + */ +static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +{ + char *name = raw_command; + char *args = strpbrk(name, " "); + char *flags; + + if (args) + *args++ = '\0'; + + flags = strpbrk(name, ":"); + + if (flags) + *flags++ = '\0'; + + return user_event_parse(name, args, flags, newuser); +} + +static int user_field_array_size(const char *type) +{ + const char *start = strchr(type, '['); + char val[8]; + char *bracket; + int size = 0; + + if (start == NULL) + return -EINVAL; + + if (strscpy(val, start + 1, sizeof(val)) <= 0) + return -EINVAL; + + bracket = strchr(val, ']'); + + if (!bracket) + return -EINVAL; + + *bracket = '\0'; + + if (kstrtouint(val, 0, &size)) + return -EINVAL; + + if (size > MAX_FIELD_ARRAY_SIZE) + return -EINVAL; + + return size; +} + +static int user_field_size(const char *type) +{ + /* long is not allowed from a user, since it's ambigious in size */ + if (strcmp(type, "s64") == 0) + return sizeof(s64); + if (strcmp(type, "u64") == 0) + return sizeof(u64); + if (strcmp(type, "s32") == 0) + return sizeof(s32); + if (strcmp(type, "u32") == 0) + return sizeof(u32); + if (strcmp(type, "int") == 0) + return sizeof(int); + if (strcmp(type, "unsigned int") == 0) + return sizeof(unsigned int); + if (strcmp(type, "s16") == 0) + return sizeof(s16); + if (strcmp(type, "u16") == 0) + return sizeof(u16); + if (strcmp(type, "short") == 0) + return sizeof(short); + if (strcmp(type, "unsigned short") == 0) + return sizeof(unsigned short); + if (strcmp(type, "s8") == 0) + return sizeof(s8); + if (strcmp(type, "u8") == 0) + return sizeof(u8); + if (strcmp(type, "char") == 0) + return sizeof(char); + if (strcmp(type, "unsigned char") == 0) + return sizeof(unsigned char); + if (str_has_prefix(type, "char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "unsigned char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "__data_loc ")) + return sizeof(u32); + if (str_has_prefix(type, "__rel_loc ")) + return sizeof(u32); + + /* Uknown basic type, error */ + return -EINVAL; +} + +static void user_event_destroy_validators(struct user_event *user) +{ + struct user_event_validator *validator, *next; + struct list_head *head = &user->validators; + + list_for_each_entry_safe(validator, next, head, link) { + list_del(&validator->link); + kfree(validator); + } +} + +static void user_event_destroy_fields(struct user_event *user) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kfree(field); + } +} + +static int user_event_add_field(struct user_event *user, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct user_event_validator *validator; + struct ftrace_event_field *field; + int validator_flags = 0; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + + if (!field) + return -ENOMEM; + + if (str_has_prefix(type, "__data_loc ")) + goto add_validator; + + if (str_has_prefix(type, "__rel_loc ")) { + validator_flags |= VALIDATOR_REL; + goto add_validator; + } + + goto add_field; + +add_validator: + if (strstr(type, "char") != 0) + validator_flags |= VALIDATOR_ENSURE_NULL; + + validator = kmalloc(sizeof(*validator), GFP_KERNEL); + + if (!validator) { + kfree(field); + return -ENOMEM; + } + + validator->flags = validator_flags; + validator->offset = offset; + + /* Want sequential access when validating */ + list_add_tail(&validator->link, &user->validators); + +add_field: + field->type = type; + field->name = name; + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + field->filter_type = filter_type; + + list_add(&field->link, &user->fields); + + /* + * Min size from user writes that are required, this does not include + * the size of trace_entry (common fields). + */ + user->min_size = (offset + size) - sizeof(struct trace_entry); + + return 0; +} + +/* + * Parses the values of a field within the description + * Format: type name [size] + */ +static int user_event_parse_field(char *field, struct user_event *user, + u32 *offset) +{ + char *part, *type, *name; + u32 depth = 0, saved_offset = *offset; + int len, size = -EINVAL; + bool is_struct = false; + + field = skip_spaces(field); + + if (*field == '\0') + return 0; + + /* Handle types that have a space within */ + len = str_has_prefix(field, "unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "struct "); + if (len) { + is_struct = true; + goto skip_next; + } + + len = str_has_prefix(field, "__data_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__data_loc "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc "); + if (len) + goto skip_next; + + goto parse; +skip_next: + type = field; + field = strpbrk(field + len, " "); + + if (field == NULL) + return -EINVAL; + + *field++ = '\0'; + depth++; +parse: + name = NULL; + + while ((part = strsep(&field, " ")) != NULL) { + switch (depth++) { + case FIELD_DEPTH_TYPE: + type = part; + break; + case FIELD_DEPTH_NAME: + name = part; + break; + case FIELD_DEPTH_SIZE: + if (!is_struct) + return -EINVAL; + + if (kstrtou32(part, 10, &size)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (depth < FIELD_DEPTH_SIZE || !name) + return -EINVAL; + + if (depth == FIELD_DEPTH_SIZE) + size = user_field_size(type); + + if (size == 0) + return -EINVAL; + + if (size < 0) + return size; + + *offset = saved_offset + size; + + return user_event_add_field(user, type, name, saved_offset, size, + type[0] != 'u', FILTER_OTHER); +} + +static int user_event_parse_fields(struct user_event *user, char *args) +{ + char *field; + u32 offset = sizeof(struct trace_entry); + int ret = -EINVAL; + + if (args == NULL) + return 0; + + while ((field = strsep(&args, ";")) != NULL) { + ret = user_event_parse_field(field, user, &offset); + + if (ret) + break; + } + + return ret; +} + +static struct trace_event_fields user_event_fields_array[1]; + +static const char *user_field_format(const char *type) +{ + if (strcmp(type, "s64") == 0) + return "%lld"; + if (strcmp(type, "u64") == 0) + return "%llu"; + if (strcmp(type, "s32") == 0) + return "%d"; + if (strcmp(type, "u32") == 0) + return "%u"; + if (strcmp(type, "int") == 0) + return "%d"; + if (strcmp(type, "unsigned int") == 0) + return "%u"; + if (strcmp(type, "s16") == 0) + return "%d"; + if (strcmp(type, "u16") == 0) + return "%u"; + if (strcmp(type, "short") == 0) + return "%d"; + if (strcmp(type, "unsigned short") == 0) + return "%u"; + if (strcmp(type, "s8") == 0) + return "%d"; + if (strcmp(type, "u8") == 0) + return "%u"; + if (strcmp(type, "char") == 0) + return "%d"; + if (strcmp(type, "unsigned char") == 0) + return "%u"; + if (strstr(type, "char[") != 0) + return "%s"; + + /* Unknown, likely struct, allowed treat as 64-bit */ + return "%llu"; +} + +static bool user_field_is_dyn_string(const char *type, const char **str_func) +{ + if (str_has_prefix(type, "__data_loc ")) { + *str_func = "__get_str"; + goto check; + } + + if (str_has_prefix(type, "__rel_loc ")) { + *str_func = "__get_rel_str"; + goto check; + } + + return false; +check: + return strstr(type, "char") != 0; +} + +#define LEN_OR_ZERO (len ? len - pos : 0) +static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + int pos = 0, depth = 0; + const char *str_func; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (depth != 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s", + field->name, user_field_format(field->type)); + + depth++; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (user_field_is_dyn_string(field->type, &str_func)) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", %s(%s)", str_func, field->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", field->name); + } + + return pos + 1; +} +#undef LEN_OR_ZERO + +static int user_event_create_print_fmt(struct user_event *user) +{ + char *print_fmt; + int len; + + len = user_event_set_print_fmt(user, NULL, 0); + + print_fmt = kmalloc(len, GFP_KERNEL); + + if (!print_fmt) + return -ENOMEM; + + user_event_set_print_fmt(user, print_fmt, len); + + user->call.print_fmt = print_fmt; + + return 0; +} + +static enum print_line_t user_event_print_trace(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + /* Unsafe to try to decode user provided print_fmt, use hex */ + trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16, + 1, iter->ent, iter->ent_size, true); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions user_event_funcs = { + .trace = user_event_print_trace, +}; + +static int user_event_set_call_visible(struct user_event *user, bool visible) +{ + int ret; + const struct cred *old_cred; + struct cred *cred; + + cred = prepare_creds(); + + if (!cred) + return -ENOMEM; + + /* + * While by default tracefs is locked down, systems can be configured + * to allow user_event files to be less locked down. The extreme case + * being "other" has read/write access to user_events_data/status. + * + * When not locked down, processes may not have have permissions to + * add/remove calls themselves to tracefs. We need to temporarily + * switch to root file permission to allow for this scenario. + */ + cred->fsuid = GLOBAL_ROOT_UID; + + old_cred = override_creds(cred); + + if (visible) + ret = trace_add_event_call(&user->call); + else + ret = trace_remove_event_call(&user->call); + + revert_creds(old_cred); + put_cred(cred); + + return ret; +} + +static int destroy_user_event(struct user_event *user) +{ + int ret = 0; + + /* Must destroy fields before call removal */ + user_event_destroy_fields(user); + + ret = user_event_set_call_visible(user, false); + + if (ret) + return ret; + + dyn_event_remove(&user->devent); + + register_page_data[user->index] = 0; + clear_bit(user->index, page_bitmap); + hash_del(&user->node); + + user_event_destroy_validators(user); + kfree(user->call.print_fmt); + kfree(EVENT_NAME(user)); + kfree(user); + + return ret; +} + +static struct user_event *find_user_event(char *name, u32 *outkey) +{ + struct user_event *user; + u32 key = user_event_key(name); + + *outkey = key; + + hash_for_each_possible(register_table, user, node, key) + if (!strcmp(EVENT_NAME(user), name)) { + atomic_inc(&user->refcnt); + return user; + } + + return NULL; +} + +static int user_event_validate(struct user_event *user, void *data, int len) +{ + struct list_head *head = &user->validators; + struct user_event_validator *validator; + void *pos, *end = data + len; + u32 loc, offset, size; + + list_for_each_entry(validator, head, link) { + pos = data + validator->offset; + + /* Already done min_size check, no bounds check here */ + loc = *(u32 *)pos; + offset = loc & 0xffff; + size = loc >> 16; + + if (likely(validator->flags & VALIDATOR_REL)) + pos += offset + sizeof(loc); + else + pos = data + offset; + + pos += size; + + if (unlikely(pos > end)) + return -EFAULT; + + if (likely(validator->flags & VALIDATOR_ENSURE_NULL)) + if (unlikely(*(char *)(pos - 1) != '\0')) + return -EFAULT; + } + + return 0; +} + +/* + * Writes the user supplied payload out to a trace file. + */ +static void user_event_ftrace(struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted) +{ + struct trace_event_file *file; + struct trace_entry *entry; + struct trace_event_buffer event_buffer; + size_t size = sizeof(*entry) + i->count; + + file = (struct trace_event_file *)tpdata; + + if (!file || + !(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) + return; + + /* Allocates and fills trace_entry, + 1 of this is data payload */ + entry = trace_event_buffer_reserve(&event_buffer, file, size); + + if (unlikely(!entry)) + return; + + if (unlikely(!copy_nofault(entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, entry, size))) + goto discard; + + trace_event_buffer_commit(&event_buffer); + + return; +discard: + *faulted = true; + __trace_event_discard_commit(event_buffer.buffer, + event_buffer.event); +} + +#ifdef CONFIG_PERF_EVENTS +/* + * Writes the user supplied payload out to perf ring buffer. + */ +static void user_event_perf(struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted) +{ + struct hlist_head *perf_head; + + perf_head = this_cpu_ptr(user->call.perf_events); + + if (perf_head && !hlist_empty(perf_head)) { + struct trace_entry *perf_entry; + struct pt_regs *regs; + size_t size = sizeof(*perf_entry) + i->count; + int context; + + perf_entry = perf_trace_buf_alloc(ALIGN(size, 8), + ®s, &context); + + if (unlikely(!perf_entry)) + return; + + perf_fetch_caller_regs(regs); + + if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, perf_entry, size))) + goto discard; + + perf_trace_buf_submit(perf_entry, size, context, + user->call.event.type, 1, regs, + perf_head, NULL); + + return; +discard: + *faulted = true; + perf_swevent_put_recursion_context(context); + } +} +#endif + +/* + * Update the register page that is shared between user processes. + */ +static void update_reg_page_for(struct user_event *user) +{ + struct tracepoint *tp = &user->tracepoint; + char status = 0; + + if (atomic_read(&tp->key.enabled) > 0) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + + if (probe_func == user_event_ftrace) + status |= EVENT_STATUS_FTRACE; +#ifdef CONFIG_PERF_EVENTS + else if (probe_func == user_event_perf) + status |= EVENT_STATUS_PERF; +#endif + else + status |= EVENT_STATUS_OTHER; + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + } + + register_page_data[user->index] = status; +} + +/* + * Register callback for our events from tracing sub-systems. + */ +static int user_event_reg(struct trace_event_call *call, + enum trace_reg type, + void *data) +{ + struct user_event *user = (struct user_event *)call->data; + int ret = 0; + + if (!user) + return -ENOENT; + + switch (type) { + case TRACE_REG_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->probe, + data); + goto dec; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->perf_probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->perf_probe, + data); + goto dec; + + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + break; +#endif + } + + return ret; +inc: + atomic_inc(&user->refcnt); + update_reg_page_for(user); + return 0; +dec: + update_reg_page_for(user); + atomic_dec(&user->refcnt); + return 0; +} + +static int user_event_create(const char *raw_command) +{ + struct user_event *user; + char *name; + int ret; + + if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX)) + return -ECANCELED; + + raw_command += USER_EVENTS_PREFIX_LEN; + raw_command = skip_spaces(raw_command); + + name = kstrdup(raw_command, GFP_KERNEL); + + if (!name) + return -ENOMEM; + + mutex_lock(®_mutex); + + ret = user_event_parse_cmd(name, &user); + + if (!ret) + atomic_dec(&user->refcnt); + + mutex_unlock(®_mutex); + + if (ret) + kfree(name); + + return ret; +} + +static int user_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + struct ftrace_event_field *field, *next; + struct list_head *head; + int depth = 0; + + seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user)); + + head = trace_get_fields(&user->call); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (depth == 0) + seq_puts(m, " "); + else + seq_puts(m, "; "); + + seq_printf(m, "%s %s", field->type, field->name); + + if (str_has_prefix(field->type, "struct ")) + seq_printf(m, " %d", field->size); + + depth++; + } + + seq_puts(m, "\n"); + + return 0; +} + +static bool user_event_is_busy(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + return atomic_read(&user->refcnt) != 0; +} + +static int user_event_free(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + if (atomic_read(&user->refcnt) != 0) + return -EBUSY; + + return destroy_user_event(user); +} + +static bool user_field_match(struct ftrace_event_field *field, int argc, + const char **argv, int *iout) +{ + char *field_name, *arg_name; + int len, pos, i = *iout; + bool colon = false, match = false; + + if (i >= argc) + return false; + + len = MAX_FIELD_ARG_NAME; + field_name = kmalloc(len, GFP_KERNEL); + arg_name = kmalloc(len, GFP_KERNEL); + + if (!arg_name || !field_name) + goto out; + + pos = 0; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(arg_name + pos, len - pos, " "); + + pos += snprintf(arg_name + pos, len - pos, argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + colon = true; + break; + } + } + + pos = 0; + + pos += snprintf(field_name + pos, len - pos, field->type); + pos += snprintf(field_name + pos, len - pos, " "); + pos += snprintf(field_name + pos, len - pos, field->name); + + if (colon) + pos += snprintf(field_name + pos, len - pos, ";"); + + *iout = i; + + match = strcmp(arg_name, field_name) == 0; +out: + kfree(arg_name); + kfree(field_name); + + return match; +} + +static bool user_fields_match(struct user_event *user, int argc, + const char **argv) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + int i = 0; + + list_for_each_entry_safe_reverse(field, next, head, link) + if (!user_field_match(field, argc, argv, &i)) + return false; + + if (i != argc) + return false; + + return true; +} + +static bool user_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + bool match; + + match = strcmp(EVENT_NAME(user), event) == 0 && + (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + + if (match && argc > 0) + match = user_fields_match(user, argc, argv); + + return match; +} + +static struct dyn_event_operations user_event_dops = { + .create = user_event_create, + .show = user_event_show, + .is_busy = user_event_is_busy, + .free = user_event_free, + .match = user_event_match, +}; + +static int user_event_trace_register(struct user_event *user) +{ + int ret; + + ret = register_trace_event(&user->call.event); + + if (!ret) + return -ENODEV; + + ret = user_event_set_call_visible(user, true); + + if (ret) + unregister_trace_event(&user->call.event); + + return ret; +} + +/* + * Parses the event name, arguments and flags then registers if successful. + * The name buffer lifetime is owned by this method for success cases only. + * Upon success the returned user_event has its ref count increased by 1. + */ +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser) +{ + int ret; + int index; + u32 key; + struct user_event *user; + + /* Prevent dyn_event from racing */ + mutex_lock(&event_mutex); + user = find_user_event(name, &key); + mutex_unlock(&event_mutex); + + if (user) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); + return 0; + } + + index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + + if (index == MAX_EVENTS) + return -EMFILE; + + user = kzalloc(sizeof(*user), GFP_KERNEL); + + if (!user) + return -ENOMEM; + + INIT_LIST_HEAD(&user->class.fields); + INIT_LIST_HEAD(&user->fields); + INIT_LIST_HEAD(&user->validators); + + user->tracepoint.name = name; + + ret = user_event_parse_fields(user, args); + + if (ret) + goto put_user; + + ret = user_event_create_print_fmt(user); + + if (ret) + goto put_user; + + user->call.data = user; + user->call.class = &user->class; + user->call.name = name; + user->call.flags = TRACE_EVENT_FL_TRACEPOINT; + user->call.tp = &user->tracepoint; + user->call.event.funcs = &user_event_funcs; + + user->class.system = USER_EVENTS_SYSTEM; + user->class.fields_array = user_event_fields_array; + user->class.get_fields = user_event_get_fields; + user->class.reg = user_event_reg; + user->class.probe = user_event_ftrace; +#ifdef CONFIG_PERF_EVENTS + user->class.perf_probe = user_event_perf; +#endif + + mutex_lock(&event_mutex); + + ret = user_event_trace_register(user); + + if (ret) + goto put_user_lock; + + user->index = index; + + /* Ensure we track ref */ + atomic_inc(&user->refcnt); + + dyn_event_init(&user->devent, &user_event_dops); + dyn_event_add(&user->devent, &user->call); + set_bit(user->index, page_bitmap); + hash_add(register_table, &user->node, key); + + mutex_unlock(&event_mutex); + + *newuser = user; + return 0; +put_user_lock: + mutex_unlock(&event_mutex); +put_user: + user_event_destroy_fields(user); + user_event_destroy_validators(user); + kfree(user); + return ret; +} + +/* + * Deletes a previously created event if it is no longer being used. + */ +static int delete_user_event(char *name) +{ + u32 key; + int ret; + struct user_event *user = find_user_event(name, &key); + + if (!user) + return -ENOENT; + + /* Ensure we are the last ref */ + if (atomic_read(&user->refcnt) != 1) { + ret = -EBUSY; + goto put_ref; + } + + ret = destroy_user_event(user); + + if (ret) + goto put_ref; + + return ret; +put_ref: + /* No longer have this ref */ + atomic_dec(&user->refcnt); + + return ret; +} + +/* + * Validates the user payload and writes via iterator. + */ +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) +{ + struct user_event_refs *refs; + struct user_event *user = NULL; + struct tracepoint *tp; + ssize_t ret = i->count; + int idx; + + if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx))) + return -EFAULT; + + rcu_read_lock_sched(); + + refs = rcu_dereference_sched(file->private_data); + + /* + * The refs->events array is protected by RCU, and new items may be + * added. But the user retrieved from indexing into the events array + * shall be immutable while the file is opened. + */ + if (likely(refs && idx < refs->count)) + user = refs->events[idx]; + + rcu_read_unlock_sched(); + + if (unlikely(user == NULL)) + return -ENOENT; + + if (unlikely(i->count < user->min_size)) + return -EINVAL; + + tp = &user->tracepoint; + + /* + * It's possible key.enabled disables after this check, however + * we don't mind if a few events are included in this condition. + */ + if (likely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + struct iov_iter copy; + void *tpdata; + bool faulted; + + if (unlikely(fault_in_iov_iter_readable(i, i->count))) + return -EFAULT; + + faulted = false; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + copy = *i; + probe_func = probe_func_ptr->func; + tpdata = probe_func_ptr->data; + probe_func(user, ©, tpdata, &faulted); + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + + if (unlikely(faulted)) + return -EFAULT; + } + + return ret; +} + +static ssize_t user_events_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct iovec iov; + struct iov_iter i; + + if (unlikely(*ppos != 0)) + return -EFAULT; + + if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + return -EFAULT; + + return user_events_write_core(file, &i); +} + +static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) +{ + return user_events_write_core(kp->ki_filp, i); +} + +static int user_events_ref_add(struct file *file, struct user_event *user) +{ + struct user_event_refs *refs, *new_refs; + int i, size, count = 0; + + refs = rcu_dereference_protected(file->private_data, + lockdep_is_held(®_mutex)); + + if (refs) { + count = refs->count; + + for (i = 0; i < count; ++i) + if (refs->events[i] == user) + return i; + } + + size = struct_size(refs, events, count + 1); + + new_refs = kzalloc(size, GFP_KERNEL); + + if (!new_refs) + return -ENOMEM; + + new_refs->count = count + 1; + + for (i = 0; i < count; ++i) + new_refs->events[i] = refs->events[i]; + + new_refs->events[i] = user; + + atomic_inc(&user->refcnt); + + rcu_assign_pointer(file->private_data, new_refs); + + if (refs) + kfree_rcu(refs, rcu); + + return i; +} + +static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); +} + +/* + * Registers a user_event on behalf of a user process. + */ +static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +{ + struct user_reg __user *ureg = (struct user_reg __user *)uarg; + struct user_reg reg; + struct user_event *user; + char *name; + long ret; + + ret = user_reg_get(ureg, ®); + + if (ret) + return ret; + + name = strndup_user((const char __user *)(uintptr_t)reg.name_args, + MAX_EVENT_DESC); + + if (IS_ERR(name)) { + ret = PTR_ERR(name); + return ret; + } + + ret = user_event_parse_cmd(name, &user); + + if (ret) { + kfree(name); + return ret; + } + + ret = user_events_ref_add(file, user); + + /* No longer need parse ref, ref_add either worked or not */ + atomic_dec(&user->refcnt); + + /* Positive number is index and valid */ + if (ret < 0) + return ret; + + put_user((u32)ret, &ureg->write_index); + put_user(user->index, &ureg->status_index); + + return 0; +} + +/* + * Deletes a user_event on behalf of a user process. + */ +static long user_events_ioctl_del(struct file *file, unsigned long uarg) +{ + void __user *ubuf = (void __user *)uarg; + char *name; + long ret; + + name = strndup_user(ubuf, MAX_EVENT_DESC); + + if (IS_ERR(name)) + return PTR_ERR(name); + + /* event_mutex prevents dyn_event from racing */ + mutex_lock(&event_mutex); + ret = delete_user_event(name); + mutex_unlock(&event_mutex); + + kfree(name); + + return ret; +} + +/* + * Handles the ioctl from user mode to register or alter operations. + */ +static long user_events_ioctl(struct file *file, unsigned int cmd, + unsigned long uarg) +{ + long ret = -ENOTTY; + + switch (cmd) { + case DIAG_IOCSREG: + mutex_lock(®_mutex); + ret = user_events_ioctl_reg(file, uarg); + mutex_unlock(®_mutex); + break; + + case DIAG_IOCSDEL: + mutex_lock(®_mutex); + ret = user_events_ioctl_del(file, uarg); + mutex_unlock(®_mutex); + break; + } + + return ret; +} + +/* + * Handles the final close of the file from user mode. + */ +static int user_events_release(struct inode *node, struct file *file) +{ + struct user_event_refs *refs; + struct user_event *user; + int i; + + /* + * Ensure refs cannot change under any situation by taking the + * register mutex during the final freeing of the references. + */ + mutex_lock(®_mutex); + + refs = file->private_data; + + if (!refs) + goto out; + + /* + * The lifetime of refs has reached an end, it's tied to this file. + * The underlying user_events are ref counted, and cannot be freed. + * After this decrement, the user_events may be freed elsewhere. + */ + for (i = 0; i < refs->count; ++i) { + user = refs->events[i]; + + if (user) + atomic_dec(&user->refcnt); + } +out: + file->private_data = NULL; + + mutex_unlock(®_mutex); + + kfree(refs); + + return 0; +} + +static const struct file_operations user_data_fops = { + .write = user_events_write, + .write_iter = user_events_write_iter, + .unlocked_ioctl = user_events_ioctl, + .release = user_events_release, +}; + +/* + * Maps the shared page into the user process for checking if event is enabled. + */ +static int user_status_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + + if (size != MAX_EVENTS) + return -EINVAL; + + return remap_pfn_range(vma, vma->vm_start, + virt_to_phys(register_page_data) >> PAGE_SHIFT, + size, vm_get_page_prot(VM_READ)); +} + +static void *user_seq_start(struct seq_file *m, loff_t *pos) +{ + if (*pos) + return NULL; + + return (void *)1; +} + +static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void user_seq_stop(struct seq_file *m, void *p) +{ +} + +static int user_seq_show(struct seq_file *m, void *p) +{ + struct user_event *user; + char status; + int i, active = 0, busy = 0, flags; + + mutex_lock(®_mutex); + + hash_for_each(register_table, i, user, node) { + status = register_page_data[user->index]; + flags = user->flags; + + seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); + + if (flags != 0 || status != 0) + seq_puts(m, " #"); + + if (status != 0) { + seq_puts(m, " Used by"); + if (status & EVENT_STATUS_FTRACE) + seq_puts(m, " ftrace"); + if (status & EVENT_STATUS_PERF) + seq_puts(m, " perf"); + if (status & EVENT_STATUS_OTHER) + seq_puts(m, " other"); + busy++; + } + + seq_puts(m, "\n"); + active++; + } + + mutex_unlock(®_mutex); + + seq_puts(m, "\n"); + seq_printf(m, "Active: %d\n", active); + seq_printf(m, "Busy: %d\n", busy); + seq_printf(m, "Max: %ld\n", MAX_EVENTS); + + return 0; +} + +static const struct seq_operations user_seq_ops = { + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, +}; + +static int user_status_open(struct inode *node, struct file *file) +{ + return seq_open(file, &user_seq_ops); +} + +static const struct file_operations user_status_fops = { + .open = user_status_open, + .mmap = user_status_mmap, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Creates a set of tracefs files to allow user mode interactions. + */ +static int create_user_tracefs(void) +{ + struct dentry *edata, *emmap; + + edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE, + NULL, NULL, &user_data_fops); + + if (!edata) { + pr_warn("Could not create tracefs 'user_events_data' entry\n"); + goto err; + } + + /* mmap with MAP_SHARED requires writable fd */ + emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, + NULL, NULL, &user_status_fops); + + if (!emmap) { + tracefs_remove(edata); + pr_warn("Could not create tracefs 'user_events_mmap' entry\n"); + goto err; + } + + return 0; +err: + return -ENODEV; +} + +static void set_page_reservations(bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = register_page_data + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static int __init trace_events_user_init(void) +{ + struct page *pages; + int ret; + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(page_bitmap, MAX_EVENTS); + set_bit(0, page_bitmap); + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); + if (!pages) + return -ENOMEM; + register_page_data = page_address(pages); + + set_page_reservations(true); + + ret = create_user_tracefs(); + + if (ret) { + pr_warn("user_events could not register with tracefs\n"); + set_page_reservations(false); + __free_pages(pages, MAX_PAGE_ORDER); + return ret; + } + + if (dyn_event_register(&user_event_dops)) + pr_warn("user_events could not register with dyn_events\n"); + + return 0; +} + +fs_initcall(trace_events_user_init); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b62fd785b599..a245ea673715 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1433,7 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, fbuffer.regs = regs; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; + entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); @@ -1628,7 +1628,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; entry->func = (unsigned long)tk->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; + entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1718,8 +1718,17 @@ static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); - struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return 0; + tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) @@ -1907,25 +1916,18 @@ core_initcall(init_kprobe_trace_early); static __init int init_kprobe_trace(void) { int ret; - struct dentry *entry; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, - NULL, NULL, &kprobe_events_ops); - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'kprobe_events' entry\n"); + trace_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, - NULL, NULL, &kprobe_profile_ops); - - if (!entry) - pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); + trace_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); setup_boot_kprobe_events(); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 5e3c62a08fc0..313439920a8c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1167,8 +1167,10 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) * used to record the beginning and to report the end of a thread noise window. */ static void -trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, - struct task_struct *n) +trace_sched_switch_callback(void *data, bool preempt, + struct task_struct *p, + struct task_struct *n, + unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); @@ -1576,11 +1578,27 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) trace_timerlat_sample(&s); - notify_new_max_latency(diff); + if (osnoise_data.stop_tracing) { + if (time_to_us(diff) >= osnoise_data.stop_tracing) { + + /* + * At this point, if stop_tracing is set and <= print_stack, + * print_stack is set and would be printed in the thread handler. + * + * Thus, print the stack trace as it is helpful to define the + * root cause of an IRQ latency. + */ + if (osnoise_data.stop_tracing <= osnoise_data.print_stack) { + timerlat_save_stack(0); + timerlat_dump_stack(time_to_us(diff)); + } - if (osnoise_data.stop_tracing) - if (time_to_us(diff) >= osnoise_data.stop_tracing) osnoise_stop_tracing(); + notify_new_max_latency(diff); + + return HRTIMER_NORESTART; + } + } wake_up_process(tlat->kthread); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8aa493d25c73..67f47ea27921 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { - struct trace_event *e; + struct trace_event *e = NULL, *iter; int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { @@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list) * We used up all possible max events, * lets see if somebody freed one. */ - list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != next) + list_for_each_entry(iter, &ftrace_event_list, list) { + if (iter->type != next) { + e = iter; break; + } next++; } @@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list) if (next > TRACE_EVENT_TYPE_MAX) return 0; - *list = &e->list; + if (e) + *list = &e->list; + else + *list = &ftrace_event_list; return next; } @@ -778,9 +783,8 @@ int register_trace_event(struct trace_event *event) list_add_tail(&event->list, list); - } else if (event->type > __TRACE_LAST_TYPE) { - printk(KERN_WARNING "Need to add type to trace.h\n"); - WARN_ON(1); + } else if (WARN(event->type > __TRACE_LAST_TYPE, + "Need to add type to trace.h")) { goto out; } else { /* Is this event already used */ @@ -1571,13 +1575,8 @@ __init static int init_events(void) for (i = 0; events[i]; i++) { event = events[i]; - ret = register_trace_event(event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - event->type); - WARN_ON_ONCE(1); - } + WARN_ONCE(!ret, "event %d failed to register", event->type); } return 0; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f4938040c228..95b58bd757ce 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,7 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); @@ -94,7 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index 4d4b78c8ca25..a520b11afb0d 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = { __init static int create_recursed_functions(void) { - struct dentry *dentry; - dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, - NULL, NULL, &recursed_functions_fops); - if (!dentry) - pr_warn("WARNING: Failed to create recursed_functions\n"); + trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e304196d7c28..c9ffdcfe622e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -22,7 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { int flags; @@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee) if (!flags) return; - tracing_record_taskinfo(current, flags); + tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2402de520eca..330aee1c1a49 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -426,7 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { struct trace_array_cpu *data; u64 T0, T1, delta; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index abcadbe933bb..a2d301f58ced 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -895,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace, ret = -1; goto out; } + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); #endif /* Don't test dynamic tracing, the function tracer already did */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f755bde42fd0..b69e207012c9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ @@ -296,9 +296,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct trace_event_file *trace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; unsigned long args[6]; int syscall_nr; int size; @@ -321,20 +319,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->enter_event->event.type, size, trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; syscall_get_arguments(current, regs, args); memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,9 +337,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_file *trace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -364,20 +356,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->exit_event->event.type, sizeof(*entry), - trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry)); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static int reg_event_syscall_enter(struct trace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..c3dc4f859a6b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -546,7 +546,6 @@ static int __trace_uprobe_create(int argc, const char **argv) bool is_return = false; int i, ret; - ret = 0; ref_ctr_offset = 0; switch (argv[0][0]) { diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9628b5571846..9901708ce6b8 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1045,7 +1045,8 @@ static void sort_secondary(struct tracing_map *map, /** * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map * @map: The tracing_map - * @sort_key: The sort key to use for sorting + * @sort_keys: The sort key to use for sorting + * @n_sort_keys: hitcount, always have at least one * @sort_entries: outval: pointer to allocated and sorted array of entries * * tracing_map_sort_entries() sorts the current set of entries in the diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1d261fbe367b..4252f0645b9e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; - u64 delta; + u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + now_ns = ktime_get_ns(); + /* store whole group time first */ + delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); + stats->ac_tgetime = delta; + delta = now_ns - tsk->start_time; + do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); @@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); + stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); diff --git a/kernel/umh.c b/kernel/umh.c index 36c123360ab8..b989736e8707 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -132,7 +132,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else @@ -171,8 +171,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work) * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping. */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); + pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 9dae1f648713..8303f4c7ca71 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -28,7 +28,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { - mntput(mnt); + kern_unmount(mnt); return ERR_CAST(file); } @@ -38,7 +38,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (err >= 0) err = -ENOMEM; filp_close(file, NULL); - mntput(mnt); + kern_unmount(mnt); return ERR_PTR(err); } diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 00703444a219..230038d4f908 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -271,7 +271,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) return 0; error_p: - for (i = 0; i < nr_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); error: @@ -370,6 +370,7 @@ static void __put_watch_queue(struct kref *kref) for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); + kfree(wqueue->notes); bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); @@ -395,6 +396,7 @@ static void free_watch(struct rcu_head *rcu) put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); + kfree(watch); } static void __put_watch(struct kref *kref) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 99afb88d2e85..ecb0e8346e65 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -848,7 +848,7 @@ void __init lockup_detector_init(void) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, - housekeeping_cpumask(HK_FLAG_TIMER)); + housekeeping_cpumask(HK_TYPE_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33f1106b4f99..1ea50f6be843 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,15 +154,20 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ - /* The current concurrency level. */ - atomic_t nr_running; + /* + * The counter is incremented in a process context on the associated CPU + * w/ preemption disabled, and decremented or reset in the same context + * but w/ pool->lock held. The readers grab pool->lock and are + * guaranteed to see if the counter reached zero. + */ + int nr_running; struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ - struct list_head idle_list; /* X: list of idle workers */ + struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ @@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work) static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(&pool->nr_running); + return !pool->nr_running; } /* @@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool) /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && - atomic_read(&pool->nr_running) <= 1; + return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ @@ -826,7 +830,7 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first idle worker. Safe with preemption disabled */ +/* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) @@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task) */ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&worker->pool->nr_running); + worker->pool->nr_running++; preempt_enable(); worker->sleeping = 0; } @@ -887,7 +891,7 @@ void wq_worker_running(struct task_struct *task) */ void wq_worker_sleeping(struct task_struct *task) { - struct worker *next, *worker = kthread_data(task); + struct worker *worker = kthread_data(task); struct worker_pool *pool; /* @@ -917,23 +921,9 @@ void wq_worker_sleeping(struct task_struct *task) return; } - /* - * The counterpart of the following dec_and_test, implied mb, - * worklist not empty test sequence is in insert_work(). - * Please read comment there. - * - * NOT_RUNNING is clear. This means that we're bound to and - * running on the local cpu w/ rq lock held and preemption - * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without pool - * lock is safe. - */ - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) { - next = first_idle_worker(pool); - if (next) - wake_up_process(next->task); - } + pool->nr_running--; + if (need_more_worker(pool)) + wake_up_worker(pool); raw_spin_unlock_irq(&pool->lock); } @@ -987,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_dec(&pool->nr_running); + pool->nr_running--; } worker->flags |= flags; @@ -1019,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&pool->nr_running); + pool->nr_running++; } /** @@ -1372,13 +1362,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, list_add_tail(&work->entry, head); get_pwq(pwq); - /* - * Ensure either wq_worker_sleeping() sees the above - * list_add_tail() or we see zero nr_running to avoid workers lying - * around lazily while there are works to be processed. - */ - smp_mb(); - if (__need_more_worker(pool)) wake_up_worker(pool); } @@ -1827,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. */ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && - atomic_read(&pool->nr_running)); + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** @@ -2806,13 +2788,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } /** - * flush_workqueue - ensure that any scheduled work has run to completion. + * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ -void flush_workqueue(struct workqueue_struct *wq) +void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), @@ -2961,7 +2943,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL(flush_workqueue); +EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2989,7 +2971,7 @@ void drain_workqueue(struct workqueue_struct *wq) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: - flush_workqueue(wq); + __flush_workqueue(wq); mutex_lock(&wq->mutex); @@ -5006,7 +4988,7 @@ static void unbind_workers(int cpu) * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ - atomic_set(&pool->nr_running, 0); + pool->nr_running = 0; /* * With concurrency management just turned off, a busy @@ -5019,7 +5001,7 @@ static void unbind_workers(int cpu) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); } mutex_unlock(&wq_pool_attach_mutex); @@ -6006,13 +5988,13 @@ static void __init wq_numa_init(void) void __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; - int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -6129,3 +6111,11 @@ void __init workqueue_init(void) wq_online = true; wq_watchdog_init(); } + +/* + * Despite the naming, this is a no-op function which is here only for avoiding + * link error. Since compile-time warning may fail to catch, we will need to + * emit run-time warning from __flush_workqueue(). + */ +void __warn_flushing_systemwide_wq(void) { } +EXPORT_SYMBOL(__warn_flushing_systemwide_wq); |