diff options
Diffstat (limited to 'kernel')
70 files changed, 3131 insertions, 395 deletions
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 6432a37ac1c9..c565fbf66ac8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3a8c9d744800..79a5da1bc5bb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1073,31 +1073,6 @@ int audit_alloc(struct task_struct *tsk) return 0; } -/** - * audit_alloc_kernel - allocate an audit_context for a kernel task - * @tsk: the kernel task - * - * Similar to the audit_alloc() function, but intended for kernel private - * threads. Returns zero on success, negative values on failure. - */ -int audit_alloc_kernel(struct task_struct *tsk) -{ - /* - * At the moment we are just going to call into audit_alloc() to - * simplify the code, but there two things to keep in mind with this - * approach: - * - * 1. Filtering internal kernel tasks is a bit laughable in almost all - * cases, but there is at least one case where there is a benefit: - * the '-a task,never' case allows the admin to effectively disable - * task auditing at runtime. - * - * 2. The {set,clear}_task_syscall_work() ops likely have zero effect - * on these internal kernel tasks, but they probably don't hurt either. - */ - return audit_alloc(tsk); -} - static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ @@ -1965,6 +1940,7 @@ void __audit_uring_exit(int success, long code) goto out; } + audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case @@ -2006,7 +1982,6 @@ void __audit_uring_exit(int success, long code) audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(ctx, success, code); audit_log_exit(); out: @@ -2090,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(context); + audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state < AUDIT_STATE_RECORD) goto out; - audit_return_fixup(context, success, return_code); audit_log_exit(); out: diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d3e734bf8056..624527401d4d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -649,6 +649,11 @@ static int bpf_iter_init_array_map(void *priv_data, seq_info->percpu_value_buf = value_buf; } + /* bpf_iter_attach_map() acquires a map uref, and the uref may be + * released before or in the middle of iterating map elements, so + * acquire an extra map uref for iterator. + */ + bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } @@ -657,6 +662,7 @@ static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; + bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2726a5950cfa..24b755eca0b3 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -68,13 +68,18 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo) +{ + return tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + static bool bpf_iter_support_resched(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; + return bpf_iter_target_support_resched(iter_priv->tinfo); } /* maximum visited objects before bailing out */ @@ -537,6 +542,10 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, if (!tinfo) return -ENOENT; + /* Only allow sleepable program for resched-able iterator */ + if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo)) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da7578426a46..6c530a5e560a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -311,12 +311,8 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { - u32 key_size = htab->map.key_size; - l = container_of(node, struct htab_elem, lru_node); - memcpy(l->key, key, key_size); - check_and_init_map_value(&htab->map, - l->key + round_up(key_size, 8)); + memcpy(l->key, key, htab->map.key_size); return l; } @@ -2064,6 +2060,7 @@ static int bpf_iter_init_hash_map(void *priv_data, seq_info->percpu_value_buf = value_buf; } + bpf_map_inc_with_uref(map); seq_info->map = map; seq_info->htab = container_of(map, struct bpf_htab, map); return 0; @@ -2073,6 +2070,7 @@ static void bpf_iter_fini_hash_map(void *priv_data) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index e2618fb5870e..82c61612f382 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -21,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - uintptr_t sk_user_data; + struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); - sk_user_data = (uintptr_t)sk->sk_user_data; - if (sk_user_data & SK_USER_DATA_BPF) { - struct sock __rcu **socks; - - socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); + socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); + if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 83c7136c5788..a4d40d98428a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3886,6 +3886,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct btf *attach_btf = bpf_prog_get_target_btf(prog); struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; @@ -4088,10 +4089,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; - if (prog->aux->attach_btf) - info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); - else if (prog->aux->dst_prog) - info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); + if (attach_btf) + info.attach_btf_obj_id = btf_obj_id(attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -5072,9 +5071,6 @@ static bool syscall_prog_is_valid_access(int off, int size, BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { - struct bpf_prog * __maybe_unused prog; - struct bpf_tramp_run_ctx __maybe_unused run_ctx; - switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_UPDATE_ELEM: @@ -5084,6 +5080,26 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) case BPF_LINK_CREATE: case BPF_RAW_TRACEPOINT_OPEN: break; + default: + return -EINVAL; + } + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); +} + + +/* To shut up -Wmissing-prototypes. + * This function is used by the kernel light skeleton + * to load bpf programs when modules are loaded or during kernel boot. + * See tools/lib/bpf/skel_internal.h + */ +int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); + +int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ + struct bpf_prog * __maybe_unused prog; + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + + switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ case BPF_PROG_TEST_RUN: if (attr->test.data_in || attr->test.data_out || @@ -5114,11 +5130,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) return 0; #endif default: - return -EINVAL; + return ____bpf_sys_bpf(cmd, attr, size); } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } -EXPORT_SYMBOL(bpf_sys_bpf); +EXPORT_SYMBOL(kern_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 0f532e6a717f..ff87e38af8a7 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -841,7 +841,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * multiple rcu callbacks. */ hlist_del(&tr->hlist); - kfree(tr->fops); + if (tr->fops) { + ftrace_free_filter(tr->fops); + kfree(tr->fops); + } kfree(tr); out: mutex_unlock(&trampoline_mutex); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2ade21b54dc4..ff6a8099eb2a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + cpus_read_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); mutex_unlock(&cgroup_mutex); return retval; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ffaccd6373f1..e4bb5d57f4d1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1820,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); + synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } @@ -2370,6 +2371,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) EXPORT_SYMBOL_GPL(task_cgroup_path); /** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. + */ +static void cgroup_attach_lock(bool lock_threadgroup) +{ + cpus_read_lock(); + if (lock_threadgroup) + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(bool lock_threadgroup) +{ + if (lock_threadgroup) + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); +} + +/** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context @@ -2841,8 +2883,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) - __acquires(&cgroup_threadgroup_rwsem) + bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; @@ -2859,12 +2900,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) { - percpu_down_write(&cgroup_threadgroup_rwsem); - *locked = true; - } else { - *locked = false; - } + *threadgroup_locked = pid || threadgroup; + cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { @@ -2895,17 +2932,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - if (*locked) { - percpu_up_write(&cgroup_threadgroup_rwsem); - *locked = false; - } + cgroup_attach_unlock(*threadgroup_locked); + *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; @@ -2913,8 +2947,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(threadgroup_locked); + for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -3000,8 +3034,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - if (has_tasks) - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3022,8 +3055,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - if (has_tasks) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(has_tasks); return ret; } @@ -3698,7 +3730,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); @@ -4971,13 +5003,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool locked; + bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5003,7 +5035,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 71a418858a5e..1f3a55297f39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2239,7 +2239,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->cpus_allowed); + ret = task_can_attach(task, cs->effective_cpus); if (ret) goto out_unlock; ret = security_task_setscheduler(task); @@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index ff756221f112..436f806aa1ed 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -34,7 +34,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m CONFIG_XEN_SCSI_FRONTEND=m # others CONFIG_XEN_BALLOON=y -CONFIG_XEN_SCRUB_PAGES=y CONFIG_XEN_DEV_EVTCHN=m CONFIG_XEN_BLKDEV_FRONTEND=m CONFIG_XEN_NETDEV_FRONTEND=m diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 71122e01623c..a0eb4d5cf557 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -9,12 +9,15 @@ #include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> +#include <linux/sizes.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/sha1.h> +#include "kallsyms_internal.h" + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -43,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline, unsigned long long *crash_base) { char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); /* for each entry of the comma-separated list */ do { @@ -87,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } cur = tmp; - if (size >= system_ram) { + if (size >= total_mem) { pr_warn("crashkernel: invalid size\n"); return -EINVAL; } /* match ? */ - if (system_ram >= start && system_ram < end) { + if (total_mem >= start && total_mem < end) { *crash_size = size; break; } @@ -480,6 +492,19 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 375fb3c9538d..c21abc77c53e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -74,7 +74,7 @@ out_unmap_membase: return ERR_PTR(-ENOMEM); } -static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +static void _dma_release_coherent_memory(struct dma_coherent_mem *mem) { if (!mem) return; @@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, ret = dma_assign_coherent_memory(dev, mem); if (ret) - dma_release_coherent_memory(mem); + _dma_release_coherent_memory(mem); return ret; } +void dma_release_coherent_memory(struct device *dev) +{ + if (dev) + _dma_release_coherent_memory(dev->dma_mem); +} + static void *__dma_alloc_from_coherent(struct device *dev, struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8d0b68a17042..63859a101ed8 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -453,29 +453,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } +/* + * Unmaps segments, except for ones marked as pci_p2pdma which do not + * require any further action as they contain a bus address. + */ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; - for_each_sg(sgl, sg, nents, i) - dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, - attrs); + for_each_sg(sgl, sg, nents, i) { + if (sg_is_dma_bus_address(sg)) + sg_dma_unmark_bus_address(sg); + else + dma_direct_unmap_page(dev, sg->dma_address, + sg_dma_len(sg), dir, attrs); + } } #endif int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { - int i; + struct pci_p2pdma_map_state p2pdma_state = {}; + enum pci_p2pdma_map_type map; struct scatterlist *sg; + int i, ret; for_each_sg(sgl, sg, nents, i) { + if (is_pci_p2pdma_page(sg_page(sg))) { + map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); + switch (map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + continue; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI + * host bridge must be mapped with CPU physical + * address and not PCI bus addresses. This is + * done with dma_direct_map_page() below. + */ + break; + default: + ret = -EREMOTEIO; + goto out_unmap; + } + } + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; + } sg_dma_len(sg) = sg->length; } @@ -483,7 +514,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return -EIO; + return ret; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index a78c0ba70645..e38ffc5e6bdd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -8,6 +8,7 @@ #define _KERNEL_DMA_DIRECT_H #include <linux/dma-direct.h> +#include <linux/memremap.h> int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, @@ -87,10 +88,15 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (is_swiotlb_force_bounce(dev)) + if (is_swiotlb_force_bounce(dev)) { + if (is_pci_p2pdma_page(page)) + return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); + } if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (is_pci_p2pdma_page(page)) + return DMA_MAPPING_ERROR; if (is_swiotlb_active(dev)) return swiotlb_map(dev, phys, size, dir, attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index db7244291b74..49cbf3e33de7 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -197,7 +197,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (ents > 0) debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO)) + ents != -EIO && ents != -EREMOTEIO)) return -EIO; return ents; @@ -249,12 +249,15 @@ EXPORT_SYMBOL(dma_map_sg_attrs); * Returns 0 on success or a negative error code on error. The following * error codes are supported with the given meaning: * - * -EINVAL An invalid argument, unaligned access or other error - * in usage. Will not succeed if retried. - * -ENOMEM Insufficient resources (like memory or IOVA space) to - * complete the mapping. Should succeed if retried later. - * -EIO Legacy error code with an unknown meaning. eg. this is - * returned if a lower level call returned DMA_MAPPING_ERROR. + * -EINVAL An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned + * DMA_MAPPING_ERROR. + * -EREMOTEIO The DMA device cannot access P2PDMA memory specified + * in the sg_table. This will not succeed if retried. */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) @@ -720,6 +723,24 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); +bool dma_pci_p2pdma_supported(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + /* if ops is not set, dma direct will be used which supports P2PDMA */ + if (!ops) + return true; + + /* + * Note: dma_ops_bypass is not checked here because P2PDMA should + * not be used with dma mapping ops that do not have support even + * if the specific device is bypassing them. + */ + + return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED; +} +EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); + #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK void arch_dma_set_mask(struct device *dev, u64 mask); #else @@ -773,6 +794,18 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +size_t dma_opt_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (ops && ops->opt_mapping_size) + size = ops->opt_mapping_size(); + + return min(dma_max_mapping_size(dev), size); +} +EXPORT_SYMBOL_GPL(dma_opt_mapping_size); + bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index cb50f8d38360..c5a9190b218f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -62,6 +62,12 @@ #define INVALID_PHYS_ADDR (~(phys_addr_t)0) +struct io_tlb_slot { + phys_addr_t orig_addr; + size_t alloc_size; + unsigned int list; +}; + static bool swiotlb_force_bounce; static bool swiotlb_force_disable; @@ -70,6 +76,62 @@ struct io_tlb_mem io_tlb_default_mem; phys_addr_t swiotlb_unencrypted_base; static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; +static unsigned long default_nareas; + +/** + * struct io_tlb_area - IO TLB memory area descriptor + * + * This is a single area with a single lock. + * + * @used: The number of used IO TLB block. + * @index: The slot index to start searching in this area for next round. + * @lock: The lock to protect the above data structures in the map and + * unmap calls. + */ +struct io_tlb_area { + unsigned long used; + unsigned int index; + spinlock_t lock; +}; + +/* + * Round up number of slabs to the next power of 2. The last area is going + * be smaller than the rest if default_nslabs is not power of two. + * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, + * otherwise a segment may span two or more areas. It conflicts with free + * contiguous slots tracking: free slots are treated contiguous no matter + * whether they cross an area boundary. + * + * Return true if default_nslabs is rounded up. + */ +static bool round_up_default_nslabs(void) +{ + if (!default_nareas) + return false; + + if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) + default_nslabs = IO_TLB_SEGSIZE * default_nareas; + else if (is_power_of_2(default_nslabs)) + return false; + default_nslabs = roundup_pow_of_two(default_nslabs); + return true; +} + +static void swiotlb_adjust_nareas(unsigned int nareas) +{ + /* use a single area when non is specified */ + if (!nareas) + nareas = 1; + else if (!is_power_of_2(nareas)) + nareas = roundup_pow_of_two(nareas); + + default_nareas = nareas; + + pr_info("area num %d.\n", nareas); + if (round_up_default_nslabs()) + pr_info("SWIOTLB bounce buffer size roundup to %luMB", + (default_nslabs << IO_TLB_SHIFT) >> 20); +} static int __init setup_io_tlb_npages(char *str) @@ -81,6 +143,10 @@ setup_io_tlb_npages(char *str) } if (*str == ',') ++str; + if (isdigit(*str)) + swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); + if (*str == ',') + ++str; if (!strcmp(str, "force")) swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) @@ -112,8 +178,11 @@ void __init swiotlb_adjust_size(unsigned long size) */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; + size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + if (round_up_default_nslabs()) + size = default_nslabs << IO_TLB_SHIFT; pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } @@ -192,7 +261,8 @@ void __init swiotlb_update_mem_attributes(void) } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, - unsigned long nslabs, unsigned int flags, bool late_alloc) + unsigned long nslabs, unsigned int flags, + bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; @@ -200,12 +270,18 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; - mem->index = 0; mem->late_alloc = late_alloc; + mem->nareas = nareas; + mem->area_nslabs = nslabs / mem->nareas; mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); - spin_lock_init(&mem->lock); + for (i = 0; i < mem->nareas; i++) { + spin_lock_init(&mem->areas[i].lock); + mem->areas[i].index = 0; + mem->areas[i].used = 0; + } + for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; @@ -232,7 +308,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long nslabs = default_nslabs; + unsigned long nslabs; size_t alloc_size; size_t bytes; void *tlb; @@ -243,6 +319,17 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, return; /* + * default_nslabs maybe changed when adjust area number. + * So allocate bounce buffer after adjusting area number. + */ + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); + + nslabs = default_nslabs; + if (nslabs < IO_TLB_MIN_SLABS) + panic("%s: nslabs = %lu too small\n", __func__, nslabs); + + /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest * memory encryption. @@ -254,7 +341,8 @@ retry: else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { - pr_warn("%s: failed to allocate tlb structure\n", __func__); + pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", + __func__, bytes); return; } @@ -274,7 +362,13 @@ retry: panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false); + mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), + default_nareas), SMP_CACHE_BYTES); + if (!mem->areas) + panic("%s: Failed to allocate mem->areas.\n", __func__); + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false, + default_nareas); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); @@ -282,7 +376,7 @@ retry: void __init swiotlb_init(bool addressing_limit, unsigned int flags) { - return swiotlb_init_remap(addressing_limit, flags, NULL); + swiotlb_init_remap(addressing_limit, flags, NULL); } /* @@ -296,7 +390,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned char *vstart = NULL; - unsigned int order; + unsigned int order, area_order; bool retried = false; int rc = 0; @@ -337,19 +431,34 @@ retry: (PAGE_SIZE << order) >> 20); } + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); + + area_order = get_order(array_size(sizeof(*mem->areas), + default_nareas)); + mem->areas = (struct io_tlb_area *) + __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); + if (!mem->areas) + goto error_area; + mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); - if (!mem->slots) { - free_pages((unsigned long)vstart, order); - return -ENOMEM; - } + if (!mem->slots) + goto error_slots; set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true, + default_nareas); swiotlb_print_info(); return 0; + +error_slots: + free_pages((unsigned long)mem->areas, area_order); +error_area: + free_pages((unsigned long)vstart, order); + return -ENOMEM; } void __init swiotlb_exit(void) @@ -357,6 +466,7 @@ void __init swiotlb_exit(void) struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long tbl_vaddr; size_t tbl_size, slots_size; + unsigned int area_order; if (swiotlb_force_bounce) return; @@ -371,9 +481,14 @@ void __init swiotlb_exit(void) set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { + area_order = get_order(array_size(sizeof(*mem->areas), + mem->nareas)); + free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { + memblock_free_late(__pa(mem->areas), + array_size(sizeof(*mem->areas), mem->nareas)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } @@ -476,9 +591,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask) return nr_slots(boundary_mask + 1); } -static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) +static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index) { - if (index >= mem->nslabs) + if (index >= mem->area_nslabs) return 0; return index; } @@ -487,10 +602,12 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size, unsigned int alloc_align_mask) +static int swiotlb_do_find_slots(struct device *dev, int area_index, + phys_addr_t orig_addr, size_t alloc_size, + unsigned int alloc_align_mask) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_area *area = mem->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -501,8 +618,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, unsigned int index, wrap, count = 0, i; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned long flags; + unsigned int slot_base; + unsigned int slot_index; BUG_ON(!nslots); + BUG_ON(area_index >= mem->nareas); /* * For mappings with an alignment requirement don't bother looping to @@ -514,16 +634,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1); - spin_lock_irqsave(&mem->lock, flags); - if (unlikely(nslots > mem->nslabs - mem->used)) + spin_lock_irqsave(&area->lock, flags); + if (unlikely(nslots > mem->area_nslabs - area->used)) goto not_found; - index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); + slot_base = area_index * mem->area_nslabs; + index = wrap = wrap_area_index(mem, ALIGN(area->index, stride)); + do { + slot_index = slot_base + index; + if (orig_addr && - (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != - (orig_addr & iotlb_align_mask)) { - index = wrap_index(mem, index + 1); + (slot_addr(tbl_dma_addr, slot_index) & + iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { + index = wrap_area_index(mem, index + 1); continue; } @@ -532,26 +656,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, * contiguous buffers, we allocate the buffers from that slot * and mark the entries as '0' indicating unavailable. */ - if (!iommu_is_span_boundary(index, nslots, + if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (mem->slots[index].list >= nslots) + if (mem->slots[slot_index].list >= nslots) goto found; } - index = wrap_index(mem, index + stride); + index = wrap_area_index(mem, index + stride); } while (index != wrap); not_found: - spin_unlock_irqrestore(&mem->lock, flags); + spin_unlock_irqrestore(&area->lock, flags); return -1; found: - for (i = index; i < index + nslots; i++) { + for (i = slot_index; i < slot_index + nslots; i++) { mem->slots[i].list = 0; - mem->slots[i].alloc_size = - alloc_size - (offset + ((i - index) << IO_TLB_SHIFT)); + mem->slots[i].alloc_size = alloc_size - (offset + + ((i - slot_index) << IO_TLB_SHIFT)); } - for (i = index - 1; + for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; @@ -559,14 +683,42 @@ found: /* * Update the indices to avoid searching in the next round. */ - if (index + nslots < mem->nslabs) - mem->index = index + nslots; + if (index + nslots < mem->area_nslabs) + area->index = index + nslots; else - mem->index = 0; - mem->used += nslots; + area->index = 0; + area->used += nslots; + spin_unlock_irqrestore(&area->lock, flags); + return slot_index; +} - spin_unlock_irqrestore(&mem->lock, flags); - return index; +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size, unsigned int alloc_align_mask) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + int start = raw_smp_processor_id() & (mem->nareas - 1); + int i = start, index; + + do { + index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size, + alloc_align_mask); + if (index >= 0) + return index; + if (++i >= mem->nareas) + i = 0; + } while (i != start); + + return -1; +} + +static unsigned long mem_used(struct io_tlb_mem *mem) +{ + int i; + unsigned long used = 0; + + for (i = 0; i < mem->nareas; i++) + used += mem->areas[i].used; + return used; } phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, @@ -580,7 +732,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, int index; phys_addr_t tlb_addr; - if (!mem) + if (!mem || !mem->nslabs) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) @@ -598,7 +750,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, mem->nslabs, mem->used); + alloc_size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -628,6 +780,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); + int aindex = index / mem->area_nslabs; + struct io_tlb_area *area = &mem->areas[aindex]; int count, i; /* @@ -636,7 +790,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ - spin_lock_irqsave(&mem->lock, flags); + BUG_ON(aindex >= mem->nareas); + + spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else @@ -660,8 +816,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; - mem->used -= nslots; - spin_unlock_irqrestore(&mem->lock, flags); + area->used -= nslots; + spin_unlock_irqrestore(&area->lock, flags); } /* @@ -756,6 +912,13 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); +static int io_tlb_used_get(void *data, u64 *val) +{ + *val = mem_used(&io_tlb_default_mem); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); + static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { @@ -764,7 +927,8 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); - debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); + debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL, + &fops_io_tlb_used); } static int __init __maybe_unused swiotlb_create_default_debugfs(void) @@ -815,6 +979,9 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; + /* Set Per-device io tlb area to one */ + unsigned int nareas = 1; + /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached @@ -831,10 +998,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, return -ENOMEM; } + mem->areas = kcalloc(nareas, sizeof(*mem->areas), + GFP_KERNEL); + if (!mem->areas) { + kfree(mem->slots); + kfree(mem); + return -ENOMEM; + } + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE, - false); + false, nareas); mem->for_alloc = true; rmem->priv = mem; diff --git a/kernel/events/core.c b/kernel/events/core.c index 4e718b93442b..2621fd24ad26 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4457,7 +4457,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 __enabled, __running, __now;; + u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) diff --git a/kernel/exit.c b/kernel/exit.c index 64c938ce36fe..84021b24f79e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1051,7 +1051,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * p->signal fields because the whole thread group is dead * and nobody can change them. * - * psig->stats_lock also protects us from our sub-theads + * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. Until * we change k_getrusage()-like users to rely on this lock * we have to take ->siglock as well. diff --git a/kernel/fork.c b/kernel/fork.c index 28772142022a..90c85b17bf69 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1965,6 +1965,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) mutex_unlock(&oom_adj_mutex); } +#ifdef CONFIG_RV +static void rv_task_fork(struct task_struct *p) +{ + int i; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) + p->rv[i].da_mon.monitoring = false; +} +#else +#define rv_task_fork(p) do {} while (0) +#endif + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2403,6 +2415,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rv_task_fork(p); + rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cff3ae8c818f..bb2354f73ded 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -229,7 +229,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 79a85834ce9d..3e7e2c2ad2f7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -32,28 +32,7 @@ #include <linux/bsearch.h> #include <linux/btf_ids.h> -/* - * These will be re-linked against their real values - * during the second link stage. - */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; - -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); - -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; - -extern const unsigned int kallsyms_markers[] __weak; +#include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h new file mode 100644 index 000000000000..2d0c6f2f0243 --- /dev/null +++ b/kernel/kallsyms_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef LINUX_KALLSYMS_INTERNAL_H_ +#define LINUX_KALLSYMS_INTERNAL_H_ + +#include <linux/types.h> + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; +extern const u8 kallsyms_names[] __weak; + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). + */ +extern const unsigned int kallsyms_num_syms +__section(".rodata") __attribute__((weak)); + +extern const unsigned long kallsyms_relative_base +__section(".rodata") __attribute__((weak)); + +extern const char kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; + +extern const unsigned int kallsyms_markers[] __weak; + +#endif // LINUX_KALLSYMS_INTERNAL_H_ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index a7b411c22f19..1d546dc97c50 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -40,6 +40,9 @@ void set_kexec_sig_enforced(void) static int kexec_calculate_store_digests(struct kimage *image); +/* Maximum size in bytes for kernel/initrd files. */ +#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX) + /* * Currently this is the only default function that is exported as some * architectures need it to do additional handlings. @@ -190,11 +193,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret; + ssize_t ret; void *ldata; ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, - INT_MAX, NULL, READING_KEXEC_IMAGE); + KEXEC_FILE_SIZE_MAX, NULL, + READING_KEXEC_IMAGE); if (ret < 0) return ret; image->kernel_buf_len = ret; @@ -214,7 +218,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, - INT_MAX, NULL, + KEXEC_FILE_SIZE_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f214f8c088ed..08350e35aba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1560,7 +1560,8 @@ static int check_kprobe_address_safe(struct kprobe *p, preempt_disable(); /* Ensure it is not in reserved area nor out of text */ - if (!kernel_text_address((unsigned long) p->addr) || + if (!(core_kernel_text((unsigned long) p->addr) || + is_module_text_address((unsigned long) p->addr)) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || @@ -1706,11 +1707,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If 'kprobes_all_disarmed' is set, 'orig_p' - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig new file mode 100644 index 000000000000..26ea5d04f56c --- /dev/null +++ b/kernel/module/Kconfig @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig MODULES + bool "Enable loadable module support" + modules + help + Kernel modules are small pieces of compiled code which can + be inserted in the running kernel, rather than being + permanently built into the kernel. You use the "modprobe" + tool to add (and sometimes remove) them. If you say Y here, + many parts of the kernel can be built as modules (by + answering M instead of Y where indicated): this is most + useful for infrequently used options which are not required + for booting. For more information, see the man pages for + modprobe, lsmod, modinfo, insmod and rmmod. + + If you say Y here, you will need to run "make + modules_install" to put the modules under /lib/modules/ + where modprobe can find them (you may need to be root to do + this). + + If unsure, say Y. + +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + +config MODULE_UNLOAD + bool "Module unloading" + help + Without this option you will not be able to unload any + modules (note that some modules may not be unloadable + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. + +config MODULE_FORCE_UNLOAD + bool "Forced module unloading" + depends on MODULE_UNLOAD + help + This option allows you to force a module to unload, even if the + kernel believes it is unsafe: the kernel will remove the module + without waiting for anyone to stop using it (using the -f option to + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + +config MODULE_UNLOAD_TAINT_TRACKING + bool "Tainted module unload tracking" + depends on MODULE_UNLOAD + default n + help + This option allows you to maintain a record of each unloaded + module that tainted the kernel. In addition to displaying a + list of linked (or loaded) modules e.g. on detection of a bad + page (see bad_page()), the aforementioned details are also + shown. If unsure, say N. + +config MODVERSIONS + bool "Module versioning support" + help + Usually, you have to use modules compiled with your kernel. + Saying Y here makes it sometimes possible to use modules + compiled for different kernels, by adding enough information + to the modules to (hopefully) spot any changes which would + make them incompatible with the kernel you are running. If + unsure, say N. + +config ASM_MODVERSIONS + bool + default HAVE_ASM_MODVERSIONS && MODVERSIONS + help + This enables module versioning for exported symbols also from + assembly. This can be enabled only when the target architecture + supports it. + +config MODULE_SRCVERSION_ALL + bool "Source checksum for all modules" + help + Modules which contain a MODULE_VERSION get an extra "srcversion" + field inserted into their modinfo section, which contains a + sum of the source files which made it. This helps maintainers + see exactly which source was used to build a module (since + others sometimes change the module source without updating + the version). With this option, such a "srcversion" field + will be created for all modules. If unsure, say N. + +config MODULE_SIG + bool "Module signature verification" + select MODULE_SIG_FORMAT + help + Check modules for valid signatures upon load: the signature + is simply appended to the module. For more information see + <file:Documentation/admin-guide/module-signing.rst>. + + Note that this option adds the OpenSSL development packages as a + kernel build dependency so that the signing tool can use its crypto + library. + + You should enable this option if you wish to use either + CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via + another LSM - otherwise unsigned modules will be loadable regardless + of the lockdown policy. + + !!!WARNING!!! If you enable this option, you MUST make sure that the + module DOES NOT get stripped after being signed. This includes the + debuginfo strip done by some packagers (such as rpmbuild) and + inclusion into an initramfs that wants the module size reduced. + +config MODULE_SIG_FORCE + bool "Require modules to be validly signed" + depends on MODULE_SIG + help + Reject unsigned modules or signed modules for which we don't have a + key. Without this, such modules will simply taint the kernel. + +config MODULE_SIG_ALL + bool "Automatically sign all modules" + default y + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + help + Sign all modules during make modules_install. Without this option, + modules must be signed manually, using the scripts/sign-file tool. + +comment "Do not forget to sign required modules with scripts/sign-file" + depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL + +choice + prompt "Which hash algorithm should modules be signed with?" + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + help + This determines which sort of hashing algorithm will be used during + signature generation. This algorithm _must_ be built into the kernel + directly so that signature verification can take place. It is not + possible to load a signed module containing the algorithm to check + the signature on that module. + +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + +config MODULE_SIG_SHA224 + bool "Sign modules with SHA-224" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA256 + bool "Sign modules with SHA-256" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA384 + bool "Sign modules with SHA-384" + select CRYPTO_SHA512 + +config MODULE_SIG_SHA512 + bool "Sign modules with SHA-512" + select CRYPTO_SHA512 + +endchoice + +config MODULE_SIG_HASH + string + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default "sha1" if MODULE_SIG_SHA1 + default "sha224" if MODULE_SIG_SHA224 + default "sha256" if MODULE_SIG_SHA256 + default "sha384" if MODULE_SIG_SHA384 + default "sha512" if MODULE_SIG_SHA512 + +choice + prompt "Module compression mode" + help + This option allows you to choose the algorithm which will be used to + compress modules when 'make modules_install' is run. (or, you can + choose to not compress modules at all.) + + External modules will also be compressed in the same way during the + installation. + + For modules inside an initrd or initramfs, it's more efficient to + compress the whole initrd or initramfs instead. + + This is fully compatible with signed modules. + + Please note that the tool used to load modules needs to support the + corresponding algorithm. module-init-tools MAY support gzip, and kmod + MAY support gzip, xz and zstd. + + Your build system needs to provide the appropriate compression tool + to compress the modules. + + If in doubt, select 'None'. + +config MODULE_COMPRESS_NONE + bool "None" + help + Do not compress modules. The installed modules are suffixed + with .ko. + +config MODULE_COMPRESS_GZIP + bool "GZIP" + help + Compress modules with GZIP. The installed modules are suffixed + with .ko.gz. + +config MODULE_COMPRESS_XZ + bool "XZ" + help + Compress modules with XZ. The installed modules are suffixed + with .ko.xz. + +config MODULE_COMPRESS_ZSTD + bool "ZSTD" + help + Compress modules with ZSTD. The installed modules are suffixed + with .ko.zst. + +endchoice + +config MODULE_DECOMPRESS + bool "Support in-kernel module decompression" + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ + select ZLIB_INFLATE if MODULE_COMPRESS_GZIP + select XZ_DEC if MODULE_COMPRESS_XZ + help + + Support for decompressing kernel modules by the kernel itself + instead of relying on userspace to perform this task. Useful when + load pinning security policy is enabled. + + If unsure, say N. + +config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + bool "Allow loading of modules with missing namespace imports" + help + Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in + a namespace. A module that makes use of a symbol exported with such a + namespace is required to import the namespace via MODULE_IMPORT_NS(). + There is no technical reason to enforce correct namespace imports, + but it creates consistency between symbols defining namespaces and + users importing namespaces they make use of. This option relaxes this + requirement and lifts the enforcement when loading a module. + + If unsure, say N. + +config MODPROBE_PATH + string "Path to modprobe binary" + default "/sbin/modprobe" + help + When kernel code requests a module, it does so by calling + the "modprobe" userspace utility. This option allows you to + set the path where that binary is found. This can be changed + at runtime via the sysctl file + /proc/sys/kernel/modprobe. Setting this to the empty string + removes the kernel's ability to request modules (but + userspace can still load modules explicitly). + +config TRIM_UNUSED_KSYMS + bool "Trim unused exported kernel symbols" if EXPERT + depends on !COMPILE_TEST + help + The kernel and some modules make many symbols available for + other modules to use via EXPORT_SYMBOL() and variants. Depending + on the set of modules being selected in your kernel configuration, + many of those exported symbols might never be used. + + This option allows for unused exported symbols to be dropped from + the build. In turn, this provides the compiler more opportunities + (especially when using LTO) for optimizing the code and reducing + binary size. This might have some security advantages as well. + + If unsure, or if you need to build out-of-tree modules, say N. + +config UNUSED_KSYMS_WHITELIST + string "Whitelist of symbols to keep in ksymtab" + depends on TRIM_UNUSED_KSYMS + help + By default, all unused exported symbols will be un-exported from the + build when TRIM_UNUSED_KSYMS is selected. + + UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept + exported at all times, even in absence of in-tree users. The value to + set here is the path to a text file containing the list of symbols, + one per line. The path can be absolute, or relative to the kernel + source tree. + +config MODULES_TREE_LOOKUP + def_bool y + depends on PERF_EVENTS || TRACING || CFI_CLANG + +endif # MODULES diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 2fc7081dd7c1..4d0bcb3d9e44 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -119,10 +119,10 @@ static ssize_t module_gzip_decompress(struct load_info *info, goto out_inflate_end; } - s.next_out = kmap(page); + s.next_out = kmap_local_page(page); s.avail_out = PAGE_SIZE; rc = zlib_inflate(&s, 0); - kunmap(page); + kunmap_local(s.next_out); new_size += PAGE_SIZE - s.avail_out; } while (rc == Z_OK); @@ -178,11 +178,11 @@ static ssize_t module_xz_decompress(struct load_info *info, goto out; } - xz_buf.out = kmap(page); + xz_buf.out = kmap_local_page(page); xz_buf.out_pos = 0; xz_buf.out_size = PAGE_SIZE; xz_ret = xz_dec_run(xz_dec, &xz_buf); - kunmap(page); + kunmap_local(xz_buf.out); new_size += xz_buf.out_pos; } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); diff --git a/kernel/module/internal.h b/kernel/module/internal.h index ec104c2950c3..680d980a4fb2 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -103,7 +103,7 @@ struct module *find_module_all(const char *name, size_t len, bool even_unformed) int cmp_name(const void *name, const void *sym); long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section); -char *module_flags(struct module *mod, char *buf); +char *module_flags(struct module *mod, char *buf, bool show_state); size_t module_flags_taint(unsigned long taints, char *buf); static inline void module_assert_mutex_or_preempt(void) diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 77e75bead569..f5c5c9175333 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -457,26 +457,39 @@ unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) return 0; } -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) +static unsigned long __module_kallsyms_lookup_name(const char *name) { struct module *mod; char *colon; - unsigned long ret = 0; + + colon = strnchr(name, MODULE_NAME_LEN, ':'); + if (colon) { + mod = find_module_all(name, colon - name, false); + if (mod) + return find_kallsyms_symbol_value(mod, colon + 1); + return 0; + } + + list_for_each_entry_rcu(mod, &modules, list) { + unsigned long ret; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + ret = find_kallsyms_symbol_value(mod, name); + if (ret) + return ret; + } + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + unsigned long ret; /* Don't lock: we're in enough trouble already. */ preempt_disable(); - if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { - if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = find_kallsyms_symbol_value(mod, colon + 1); - } else { - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) - break; - } - } + ret = __module_kallsyms_lookup_name(name); preempt_enable(); return ret; } diff --git a/kernel/module/main.c b/kernel/module/main.c index 57fc2821be63..a4e4d84b6f4e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -119,7 +119,7 @@ static void mod_update_bounds(struct module *mod) } /* Block module loading/unloading? */ -int modules_disabled = 0; +int modules_disabled; core_param(nomodule, modules_disabled, bint, 0); /* Waiting for a module to finish initializing? */ @@ -524,7 +524,10 @@ static struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); -static char last_unloaded_module[MODULE_NAME_LEN+1]; +static struct { + char name[MODULE_NAME_LEN + 1]; + char taints[MODULE_FLAGS_BUF_SIZE]; +} last_unloaded_module; #ifdef CONFIG_MODULE_UNLOAD @@ -694,6 +697,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, { struct module *mod; char name[MODULE_NAME_LEN]; + char buf[MODULE_FLAGS_BUF_SIZE]; int ret, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) @@ -753,8 +757,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); - /* Store the name of the last unloaded module for diagnostic purposes */ - strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + /* Store the name and taints of the last unloaded module for diagnostic purposes */ + strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -2094,7 +2099,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif -#ifdef CONFIG_KUNIT +#if IS_ENABLED(CONFIG_KUNIT) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); @@ -2151,7 +2156,7 @@ static int move_module(struct module *mod, struct load_info *info) #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC /* Do the allocs. */ - ptr = vmalloc(mod->data_layout.size); + ptr = vzalloc(mod->data_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2164,7 +2169,6 @@ static int move_module(struct module *mod, struct load_info *info) return -ENOMEM; } - memset(ptr, 0, mod->data_layout.size); mod->data_layout.base = ptr; #endif /* Transfer each section which specifies SHF_ALLOC */ @@ -2423,6 +2427,12 @@ static void do_free_init(struct work_struct *w) } } +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." +/* Default value for module->async_probe_requested */ +static bool async_probe; +module_param(async_probe, bool, 0644); + /* * This is where the real work happens. * @@ -2643,7 +2653,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, int ret; if (strcmp(param, "async_probe") == 0) { - mod->async_probe_requested = true; + if (strtobool(val, &mod->async_probe_requested)) + mod->async_probe_requested = true; return 0; } @@ -2810,6 +2821,8 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto bug_cleanup; + mod->async_probe_requested = async_probe; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, mod, @@ -2984,24 +2997,27 @@ static void cfi_cleanup(struct module *mod) } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ -char *module_flags(struct module *mod, char *buf) +char *module_flags(struct module *mod, char *buf, bool show_state) { int bx = 0; BUG_ON(mod->state == MODULE_STATE_UNFORMED); + if (!mod->taints && !show_state) + goto out; if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; bx += module_flags_taint(mod->taints, buf + bx); /* Show a - for module-is-being-unloaded */ - if (mod->state == MODULE_STATE_GOING) + if (mod->state == MODULE_STATE_GOING && show_state) buf[bx++] = '-'; /* Show a + for module-is-being-loaded */ - if (mod->state == MODULE_STATE_COMING) + if (mod->state == MODULE_STATE_COMING && show_state) buf[bx++] = '+'; buf[bx++] = ')'; } +out: buf[bx] = '\0'; return buf; @@ -3134,12 +3150,13 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - pr_cont(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf, true)); } print_unloaded_tainted_modules(); preempt_enable(); - if (last_unloaded_module[0]) - pr_cont(" [last unloaded: %s]", last_unloaded_module); + if (last_unloaded_module.name[0]) + pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, + last_unloaded_module.taints); pr_cont("\n"); } diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c index 9a8f4f0f6329..cf5b9f1e6ec4 100644 --- a/kernel/module/procfs.c +++ b/kernel/module/procfs.c @@ -91,7 +91,7 @@ static int m_show(struct seq_file *m, void *p) /* Taints info */ if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); + seq_printf(m, " %s", module_flags(mod, buf, true)); seq_puts(m, "\n"); return 0; diff --git a/kernel/profile.c b/kernel/profile.c index 37640a0bd8a3..7ea01ba30e75 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -109,6 +109,13 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; + + if (!prof_len) { + pr_warn("profiling shift: %u too large\n", prof_shift); + prof_on = 0; + return -EINVAL; + } + buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) @@ -418,6 +425,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) return read; } +/* default is to not implement this call */ +int __weak setup_profiling_timer(unsigned mult) +{ + return -EINVAL; +} + /* * Writing to /proc/profile resets the counters * @@ -428,8 +441,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - if (count == sizeof(int)) { unsigned int multiplier; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3dc968006ad0..79aea7df4345 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4574,7 +4574,7 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker)) + if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) pr_err("Failed to register kfree_rcu() shrinker!\n"); } diff --git a/kernel/resource.c b/kernel/resource.c index 34eaee179689..4c5e80b92f2f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -static int __region_intersects(resource_size_t start, size_t size, - unsigned long flags, unsigned long desc) +static int __region_intersects(struct resource *parent, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc) { struct resource res; int type = 0; int other = 0; @@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size, res.start = start; res.end = start + size - 1; - for (p = iomem_resource.child; p ; p = p->sibling) { + for (p = parent->child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); @@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, int ret; read_lock(&resource_lock); - ret = __region_intersects(start, size, flags, desc); + ret = __region_intersects(&iomem_resource, start, size, flags, desc); read_unlock(&resource_lock); return ret; @@ -891,6 +892,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) } write_unlock(&resource_lock); } +/* + * Not for general consumption, only early boot memory map parsing, PCI + * resource discovery, and late discovery of CXL resources are expected + * to use this interface. The former are built-in and only the latter, + * CXL, is a module. + */ +EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL); /** * remove_resource - Remove a resource in the resource tree @@ -1773,62 +1781,139 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); -#ifdef CONFIG_DEVICE_PRIVATE -static struct resource *__request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size, const char *name) +#ifdef CONFIG_GET_FREE_REGION +#define GFR_DESCENDING (1UL << 0) +#define GFR_REQUEST_REGION (1UL << 1) +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) + +static resource_size_t gfr_start(struct resource *base, resource_size_t size, + resource_size_t align, unsigned long flags) +{ + if (flags & GFR_DESCENDING) { + resource_size_t end; + + end = min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); + return end - size + 1; + } + + return ALIGN(base->start, align); +} + +static bool gfr_continue(struct resource *base, resource_size_t addr, + resource_size_t size, unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr > size && addr >= base->start; + /* + * In the ascend case be careful that the last increment by + * @size did not wrap 0. + */ + return addr > addr - size && + addr <= min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); +} + +static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, + unsigned long flags) { - resource_size_t end, addr; + if (flags & GFR_DESCENDING) + return addr - size; + return addr + size; +} + +static void remove_free_mem_region(void *_res) +{ + struct resource *res = _res; + + if (res->parent) + remove_resource(res); + free_resource(res); +} + +static struct resource * +get_free_mem_region(struct device *dev, struct resource *base, + resource_size_t size, const unsigned long align, + const char *name, const unsigned long desc, + const unsigned long flags) +{ + resource_size_t addr; struct resource *res; struct region_devres *dr = NULL; - size = ALIGN(size, 1UL << PA_SECTION_SHIFT); - end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); - addr = end - size + 1UL; + size = ALIGN(size, align); res = alloc_resource(GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); - if (dev) { + if (dev && (flags & GFR_REQUEST_REGION)) { dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) { free_resource(res); return ERR_PTR(-ENOMEM); } + } else if (dev) { + if (devm_add_action_or_reset(dev, remove_free_mem_region, res)) + return ERR_PTR(-ENOMEM); } write_lock(&resource_lock); - for (; addr > size && addr >= base->start; addr -= size) { - if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != - REGION_DISJOINT) + for (addr = gfr_start(base, size, align, flags); + gfr_continue(base, addr, size, flags); + addr = gfr_next(addr, size, flags)) { + if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) continue; - if (__request_region_locked(res, &iomem_resource, addr, size, - name, 0)) - break; + if (flags & GFR_REQUEST_REGION) { + if (__request_region_locked(res, &iomem_resource, addr, + size, name, 0)) + break; - if (dev) { - dr->parent = &iomem_resource; - dr->start = addr; - dr->n = size; - devres_add(dev, dr); - } + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } - res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - write_unlock(&resource_lock); + res->desc = desc; + write_unlock(&resource_lock); + + + /* + * A driver is claiming this region so revoke any + * mappings. + */ + revoke_iomem(res); + } else { + res->start = addr; + res->end = addr + size - 1; + res->name = name; + res->desc = desc; + res->flags = IORESOURCE_MEM; + + /* + * Only succeed if the resource hosts an exclusive + * range after the insert + */ + if (__insert_resource(base, res) || res->child) + break; + + write_unlock(&resource_lock); + } - /* - * A driver is claiming this region so revoke any mappings. - */ - revoke_iomem(res); return res; } write_unlock(&resource_lock); - free_resource(res); - if (dr) + if (flags & GFR_REQUEST_REGION) { + free_resource(res); devres_free(dr); + } else if (dev) + devm_release_action(dev, remove_free_mem_region, res); return ERR_PTR(-ERANGE); } @@ -1847,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev, struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size) { - return __request_free_mem_region(dev, base, size, dev_name(dev)); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN, + dev_name(dev), + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(devm_request_free_mem_region); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name) { - return __request_free_mem_region(NULL, base, size, name); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name, + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(request_free_mem_region); -#endif /* CONFIG_DEVICE_PRIVATE */ +/** + * alloc_free_mem_region - find a free region relative to @base + * @base: resource that will parent the new resource + * @size: size in bytes of memory to allocate from @base + * @align: alignment requirements for the allocation + * @name: resource name + * + * Buses like CXL, that can dynamically instantiate new memory regions, + * need a method to allocate physical address space for those regions. + * Allocate and insert a new resource to cover a free, unclaimed by a + * descendant of @base, range in the span of @base. + */ +struct resource *alloc_free_mem_region(struct resource *base, + unsigned long size, unsigned long align, + const char *name) +{ + /* Default of ascending direction and insert resource */ + unsigned long flags = 0; + + return get_free_mem_region(NULL, base, size, align, name, + IORES_DESC_NONE, flags); +} +EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +#endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 189999007f32..ee28253c9ac0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3802,7 +3802,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu) +static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* * Do not complicate things with the async wake_list while the CPU is @@ -3811,6 +3811,10 @@ static inline bool ttwu_queue_cond(int cpu) if (!cpu_active(cpu)) return false; + /* Ensure the task will still be allowed to run on the CPU. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. @@ -3840,7 +3844,7 @@ static inline bool ttwu_queue_cond(int cpu) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) { + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -9012,7 +9016,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, } int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) + const struct cpumask *cs_effective_cpus) { int ret = 0; @@ -9031,9 +9035,11 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + cs_effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; ret = dl_cpu_busy(cpu, p); } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ec66b40bdd40..ecb4b4ff4ce0 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,12 +190,8 @@ static void group_init(struct psi_group *group) /* Init trigger-related members */ mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; init_waitqueue_head(&group->poll_wait); timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, NULL); @@ -957,7 +953,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return 0; - cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL); + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); if (!cgroup->psi) return -ENOMEM; @@ -1091,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1320,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, nbytes, res); + new = psi_trigger_create(&psi_system, buf, res); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a6f071b2acac..e26688d387ae 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -481,9 +481,6 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); -extern void unregister_rt_sched_group(struct task_group *tg); -extern void free_rt_sched_group(struct task_group *tg); -extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); @@ -521,6 +518,10 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +extern void unregister_rt_sched_group(struct task_group *tg); +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); + /* * u64_u32_load/u64_u32_store * diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index d4788f810b55..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a492f159624f..860b2dcf3ac4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b233714a1c78..205d605cacc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -492,12 +492,12 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int *i, vleft, first = 1, err = 0; size_t left; char *p; - + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -729,7 +729,7 @@ int proc_dobool(struct ctl_table *table, int write, void *buffer, * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ @@ -1273,7 +1273,7 @@ static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lv * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in seconds, and are converted into * jiffies. * @@ -1306,8 +1306,8 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and * are converted into jiffies. * * Returns 0 on success. @@ -1315,8 +1315,8 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_userhz_jiffies_conv, NULL); } /** @@ -2061,7 +2061,7 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, @@ -2071,8 +2071,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#endif -#if defined(CONFIG_TREE_RCU) { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index fcb3b21d8bdc..90ea5f373e50 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -70,7 +70,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) +static int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) { switch (which_clock) { case CLOCK_REALTIME: @@ -90,6 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } + SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 29923b20e0e4..526257b3727c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,7 +449,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; @@ -503,7 +503,7 @@ EXPORT_SYMBOL(set_normalized_timespec64); * * Returns the timespec64 representation of the nsec parameter. */ -struct timespec64 ns_to_timespec64(const s64 nsec) +struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ccd6a5ade3e9..1052126bdca2 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1106,4 +1106,6 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. +source "kernel/trace/rv/Kconfig" + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0d261774d6f3..c6651e16b557 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -106,5 +106,6 @@ obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_RV) += rv/ libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bc921a3f7ea8..439e2ab6905e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,8 +1861,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } -static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); - /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -2974,6 +2972,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_startup_enable(command); + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + ops->flags &= ~FTRACE_OPS_FL_ADDING; return 0; @@ -3108,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the given ip. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_ip(struct ftrace_ops *ops, unsigned long ip) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) - return false; - - return true; -} - -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - return ops_references_ip(ops, rec->ip); -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -6812,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6824,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig new file mode 100644 index 000000000000..831779607e84 --- /dev/null +++ b/kernel/trace/rv/Kconfig @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config DA_MON_EVENTS + bool + +config DA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS + bool + +config DA_MON_EVENTS_ID + select DA_MON_EVENTS + bool + +menuconfig RV + bool "Runtime Verification" + depends on TRACING + help + Enable the kernel runtime verification infrastructure. RV is a + lightweight (yet rigorous) method that complements classical + exhaustive verification techniques (such as model checking and + theorem proving). RV works by analyzing the trace of the system's + actual execution, comparing it against a formal specification of + the system behavior. + + For further information, see: + Documentation/trace/rv/runtime-verification.rst + +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst + +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst + +config RV_REACTORS + bool "Runtime verification reactors" + default y + depends on RV + help + Enables the online runtime verification reactors. A runtime + monitor can cause a reaction to the detection of an exception + on the model's execution. By default, the monitors have + tracing reactions, printing the monitor output via tracepoints, + but other reactions can be added (on-demand) via this interface. + +config RV_REACT_PRINTK + bool "Printk reactor" + depends on RV_REACTORS + default y + help + Enables the printk reactor. The printk reactor emits a printk() + message if an exception is found. + +config RV_REACT_PANIC + bool "Panic reactor" + depends on RV_REACTORS + default y + help + Enables the panic reactor. The panic reactor emits a printk() + message if an exception is found and panic()s the system. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile new file mode 100644 index 000000000000..963d14875b45 --- /dev/null +++ b/kernel/trace/rv/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_RV) += rv.o +obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o +obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o +obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c new file mode 100644 index 000000000000..83cace53b9fa --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "wip" + +#include <trace/events/rv.h> +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> + +#include "wip.h" + +struct rv_monitor rv_wip; +DECLARE_DA_MON_PER_CPU(wip, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_wip(preempt_disable_wip); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_wip(preempt_enable_wip); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + da_handle_event_wip(sched_waking_wip); +} + +static int enable_wip(void) +{ + int retval; + + retval = da_monitor_init_wip(); + if (retval) + return retval; + + rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("wip", sched_waking, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable); + + return 0; +} + +static void disable_wip(void) +{ + rv_wip.enabled = 0; + + rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); + + da_monitor_destroy_wip(); +} + +struct rv_monitor rv_wip = { + .name = "wip", + .description = "wakeup in preemptive per-cpu testing monitor.", + .enable = enable_wip, + .disable = disable_wip, + .reset = da_monitor_reset_all_wip, + .enabled = 0, +}; + +static int register_wip(void) +{ + rv_register_monitor(&rv_wip); + return 0; +} + +static void unregister_wip(void) +{ + rv_unregister_monitor(&rv_wip); +} + +module_init(register_wip); +module_exit(unregister_wip); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>"); +MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor."); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h new file mode 100644 index 000000000000..c1c47e2305ef --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wip automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wip { + preemptive_wip = 0, + non_preemptive_wip, + state_max_wip +}; + +#define INVALID_STATE state_max_wip + +enum events_wip { + preempt_disable_wip = 0, + preempt_enable_wip, + sched_waking_wip, + event_max_wip +}; + +struct automaton_wip { + char *state_names[state_max_wip]; + char *event_names[event_max_wip]; + unsigned char function[state_max_wip][event_max_wip]; + unsigned char initial_state; + bool final_states[state_max_wip]; +}; + +struct automaton_wip automaton_wip = { + .state_names = { + "preemptive", + "non_preemptive" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "sched_waking" + }, + .function = { + { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, preemptive_wip, non_preemptive_wip }, + }, + .initial_state = preemptive_wip, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c new file mode 100644 index 000000000000..599225d9cf38 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "wwnr" + +#include <trace/events/rv.h> +#include <trace/events/sched.h> + +#include "wwnr.h" + +struct rv_monitor rv_wwnr; +DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); + +static void handle_switch(void *data, bool preempt, struct task_struct *p, + struct task_struct *n, unsigned int prev_state) +{ + /* start monitoring only after the first suspension */ + if (prev_state == TASK_INTERRUPTIBLE) + da_handle_start_event_wwnr(p, switch_out_wwnr); + else + da_handle_event_wwnr(p, switch_out_wwnr); + + da_handle_event_wwnr(n, switch_in_wwnr); +} + +static void handle_wakeup(void *data, struct task_struct *p) +{ + da_handle_event_wwnr(p, wakeup_wwnr); +} + +static int enable_wwnr(void) +{ + int retval; + + retval = da_monitor_init_wwnr(); + if (retval) + return retval; + + rv_attach_trace_probe("wwnr", sched_switch, handle_switch); + rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + return 0; +} + +static void disable_wwnr(void) +{ + rv_wwnr.enabled = 0; + + rv_detach_trace_probe("wwnr", sched_switch, handle_switch); + rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + da_monitor_destroy_wwnr(); +} + +struct rv_monitor rv_wwnr = { + .name = "wwnr", + .description = "wakeup while not running per-task testing model.", + .enable = enable_wwnr, + .disable = disable_wwnr, + .reset = da_monitor_reset_all_wwnr, + .enabled = 0, +}; + +static int register_wwnr(void) +{ + rv_register_monitor(&rv_wwnr); + return 0; +} + +static void unregister_wwnr(void) +{ + rv_unregister_monitor(&rv_wwnr); +} + +module_init(register_wwnr); +module_exit(unregister_wwnr); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>"); +MODULE_DESCRIPTION("wwnr: wakeup while not running monitor"); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h new file mode 100644 index 000000000000..d1afe55cdd4c --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wwnr automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wwnr { + not_running_wwnr = 0, + running_wwnr, + state_max_wwnr +}; + +#define INVALID_STATE state_max_wwnr + +enum events_wwnr { + switch_in_wwnr = 0, + switch_out_wwnr, + wakeup_wwnr, + event_max_wwnr +}; + +struct automaton_wwnr { + char *state_names[state_max_wwnr]; + char *event_names[event_max_wwnr]; + unsigned char function[state_max_wwnr][event_max_wwnr]; + unsigned char initial_state; + bool final_states[state_max_wwnr]; +}; + +struct automaton_wwnr automaton_wwnr = { + .state_names = { + "not_running", + "running" + }, + .event_names = { + "switch_in", + "switch_out", + "wakeup" + }, + .function = { + { running_wwnr, INVALID_STATE, not_running_wwnr }, + { INVALID_STATE, not_running_wwnr, INVALID_STATE }, + }, + .initial_state = not_running_wwnr, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c new file mode 100644 index 000000000000..b698d05dd069 --- /dev/null +++ b/kernel/trace/rv/reactor_panic.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Panic RV reactor: + * Prints the exception msg to the kernel message log and panic(). + */ + +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +static void rv_panic_reaction(char *msg) +{ + panic(msg); +} + +static struct rv_reactor rv_panic = { + .name = "panic", + .description = "panic the system if an exception is found.", + .react = rv_panic_reaction +}; + +static int register_react_panic(void) +{ + rv_register_reactor(&rv_panic); + return 0; +} + +static void unregister_react_panic(void) +{ + rv_unregister_reactor(&rv_panic); +} + +module_init(register_react_panic); +module_exit(unregister_react_panic); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found."); diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c new file mode 100644 index 000000000000..31899f953af4 --- /dev/null +++ b/kernel/trace/rv/reactor_printk.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Printk RV reactor: + * Prints the exception msg to the kernel message log. + */ +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +static void rv_printk_reaction(char *msg) +{ + printk_deferred(msg); +} + +static struct rv_reactor rv_printk = { + .name = "printk", + .description = "prints the exception msg to the kernel message log.", + .react = rv_printk_reaction +}; + +static int register_react_printk(void) +{ + rv_register_reactor(&rv_printk); + return 0; +} + +static void unregister_react_printk(void) +{ + rv_unregister_reactor(&rv_printk); +} + +module_init(register_react_printk); +module_exit(unregister_react_printk); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit."); diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c new file mode 100644 index 000000000000..6c97cc2d754a --- /dev/null +++ b/kernel/trace/rv/rv.c @@ -0,0 +1,799 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * This is the online Runtime Verification (RV) interface. + * + * RV is a lightweight (yet rigorous) method that complements classical + * exhaustive verification techniques (such as model checking and + * theorem proving) with a more practical approach to complex systems. + * + * RV works by analyzing the trace of the system's actual execution, + * comparing it against a formal specification of the system behavior. + * RV can give precise information on the runtime behavior of the + * monitored system while enabling the reaction for unexpected + * events, avoiding, for example, the propagation of a failure on + * safety-critical systems. + * + * The development of this interface roots in the development of the + * paper: + * + * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo + * Silva. Efficient formal verification for the Linux kernel. In: + * International Conference on Software Engineering and Formal Methods. + * Springer, Cham, 2019. p. 315-332. + * + * And: + * + * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis + * and verification of the real-time Linux kernel. PhD Thesis, 2020. + * + * == Runtime monitor interface == + * + * A monitor is the central part of the runtime verification of a system. + * + * The monitor stands in between the formal specification of the desired + * (or undesired) behavior, and the trace of the actual system. + * + * In Linux terms, the runtime verification monitors are encapsulated + * inside the "RV monitor" abstraction. A RV monitor includes a reference + * model of the system, a set of instances of the monitor (per-cpu monitor, + * per-task monitor, and so on), and the helper functions that glue the + * monitor to the system via trace. Generally, a monitor includes some form + * of trace output as a reaction for event parsing and exceptions, + * as depicted bellow: + * + * Linux +----- RV Monitor ----------------------------------+ Formal + * Realm | | Realm + * +-------------------+ +----------------+ +-----------------+ + * | Linux kernel | | Monitor | | Reference | + * | Tracing | -> | Instance(s) | <- | Model | + * | (instrumentation) | | (verification) | | (specification) | + * +-------------------+ +----------------+ +-----------------+ + * | | | + * | V | + * | +----------+ | + * | | Reaction | | + * | +--+--+--+-+ | + * | | | | | + * | | | +-> trace output ? | + * +------------------------|--|----------------------+ + * | +----> panic ? + * +-------> <user-specified> + * + * This file implements the interface for loading RV monitors, and + * to control the verification session. + * + * == Registering monitors == + * + * The struct rv_monitor defines a set of callback functions to control + * a verification session. For instance, when a given monitor is enabled, + * the "enable" callback function is called to hook the instrumentation + * functions to the kernel trace events. The "disable" function is called + * when disabling the verification session. + * + * A RV monitor is registered via: + * int rv_register_monitor(struct rv_monitor *monitor); + * And unregistered via: + * int rv_unregister_monitor(struct rv_monitor *monitor); + * + * == User interface == + * + * The user interface resembles kernel tracing interface. It presents + * these files: + * + * "available_monitors" + * - List the available monitors, one per line. + * + * For example: + * # cat available_monitors + * wip + * wwnr + * + * "enabled_monitors" + * - Lists the enabled monitors, one per line; + * - Writing to it enables a given monitor; + * - Writing a monitor name with a '!' prefix disables it; + * - Truncating the file disables all enabled monitors. + * + * For example: + * # cat enabled_monitors + * # echo wip > enabled_monitors + * # echo wwnr >> enabled_monitors + * # cat enabled_monitors + * wip + * wwnr + * # echo '!wip' >> enabled_monitors + * # cat enabled_monitors + * wwnr + * # echo > enabled_monitors + * # cat enabled_monitors + * # + * + * Note that more than one monitor can be enabled concurrently. + * + * "monitoring_on" + * - It is an on/off general switcher for monitoring. Note + * that it does not disable enabled monitors or detach events, + * but stops the per-entity monitors from monitoring the events + * received from the instrumentation. It resembles the "tracing_on" + * switcher. + * + * "monitors/" + * Each monitor will have its own directory inside "monitors/". There + * the monitor specific files will be presented. + * The "monitors/" directory resembles the "events" directory on + * tracefs. + * + * For example: + * # cd monitors/wip/ + * # ls + * desc enable + * # cat desc + * auto-generated wakeup in preemptive monitor. + * # cat enable + * 0 + * + * For further information, see: + * Documentation/trace/rv/runtime-verification.rst + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> + +#ifdef CONFIG_DA_MON_EVENTS +#define CREATE_TRACE_POINTS +#include <trace/events/rv.h> +#endif + +#include "rv.h" + +DEFINE_MUTEX(rv_interface_lock); + +static struct rv_interface rv_root; + +struct dentry *get_monitors_root(void) +{ + return rv_root.monitors_dir; +} + +/* + * Interface for the monitor register. + */ +static LIST_HEAD(rv_monitors_list); + +static int task_monitor_count; +static bool task_monitor_slots[RV_PER_TASK_MONITORS]; + +int rv_get_task_monitor_slot(void) +{ + int i; + + lockdep_assert_held(&rv_interface_lock); + + if (task_monitor_count == RV_PER_TASK_MONITORS) + return -EBUSY; + + task_monitor_count++; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + if (task_monitor_slots[i] == false) { + task_monitor_slots[i] = true; + return i; + } + } + + WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n"); + + return -EINVAL; +} + +void rv_put_task_monitor_slot(int slot) +{ + lockdep_assert_held(&rv_interface_lock); + + if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); + return; + } + + WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n", + slot); + + task_monitor_count--; + task_monitor_slots[slot] = false; +} + +/* + * This section collects the monitor/ files and folders. + */ +static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + const char *buff; + + buff = mdef->monitor->enabled ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +/* + * __rv_disable_monitor - disabled an enabled monitor + */ +static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +{ + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) { + mdef->monitor->enabled = 0; + mdef->monitor->disable(); + + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + if (sync) + tracepoint_synchronize_unregister(); + return 1; + } + return 0; +} + +/** + * rv_disable_monitor - disable a given runtime monitor + * + * Returns 0 on success. + */ +int rv_disable_monitor(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); + return 0; +} + +/** + * rv_enable_monitor - enable a given runtime monitor + * + * Returns 0 on success, error otherwise. + */ +int rv_enable_monitor(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +/* + * interface for enabling/disabling a monitor. + */ +static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + retval = count; + + mutex_lock(&rv_interface_lock); + + if (val) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + mutex_unlock(&rv_interface_lock); + + return retval ? : count; +} + +static const struct file_operations interface_enable_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitor_enable_write_data, + .read = monitor_enable_read_data, +}; + +/* + * Interface to read monitors description. + */ +static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + char buff[256]; + + memset(buff, 0, sizeof(buff)); + + snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static const struct file_operations interface_desc_fops = { + .open = simple_open, + .llseek = no_llseek, + .read = monitor_desc_read_data, +}; + +/* + * During the registration of a monitor, this function creates + * the monitor dir, where the specific options of the monitor + * are exposed. + */ +static int create_monitor_dir(struct rv_monitor_def *mdef) +{ + struct dentry *root = get_monitors_root(); + const char *name = mdef->monitor->name; + struct dentry *tmp; + int retval; + + mdef->root_d = rv_create_dir(name, root); + if (!mdef->root_d) + return -ENOMEM; + + tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + retval = reactor_populate_monitor(mdef); + if (retval) + goto out_remove_root; + + return 0; + +out_remove_root: + rv_remove(mdef->root_d); + return retval; +} + +/* + * Available/Enable monitor shared seq functions. + */ +static int monitors_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mon_def = p; + + seq_printf(m, "%s\n", mon_def->monitor->name); + return 0; +} + +/* + * Used by the seq file operations at the end of a read + * operation. + */ +static void monitors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +/* + * Available monitor seq functions. + */ +static void *available_monitors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_monitors_list, *pos); +} + +static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_monitors_list, pos); +} + +/* + * Enable monitor seq functions. + */ +static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct rv_monitor_def *m_def = p; + + (*pos)++; + + list_for_each_entry_continue(m_def, &rv_monitors_list, list) { + if (m_def->monitor->enabled) + return m_def; + } + + return NULL; +} + +static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) +{ + struct rv_monitor_def *m_def; + loff_t l; + + mutex_lock(&rv_interface_lock); + + if (list_empty(&rv_monitors_list)) + return NULL; + + m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + + for (l = 0; l <= *pos; ) { + m_def = enabled_monitors_next(m, m_def, &l); + if (!m_def) + break; + } + + return m_def; +} + +/* + * available/enabled monitors seq definition. + */ +static const struct seq_operations available_monitors_seq_ops = { + .start = available_monitors_start, + .next = available_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +static const struct seq_operations enabled_monitors_seq_ops = { + .start = enabled_monitors_start, + .next = enabled_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +/* + * available_monitors interface. + */ +static int available_monitors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_monitors_seq_ops); +}; + +static const struct file_operations available_monitors_ops = { + .open = available_monitors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * enabled_monitors interface. + */ +static void disable_all_monitors(void) +{ + struct rv_monitor_def *mdef; + int enabled = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(mdef, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mdef, false); + + if (enabled) { + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + tracepoint_synchronize_unregister(); + } + + mutex_unlock(&rv_interface_lock); +} + +static int enabled_monitors_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + disable_all_monitors(); + + return seq_open(file, &enabled_monitors_seq_ops); +}; + +static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + int retval = -EINVAL; + bool enable = true; + char *ptr = buff; + int len; + + if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + if (ptr[0] == '!') { + enable = false; + ptr++; + } + + len = strlen(ptr); + if (!len) + return count; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (strcmp(ptr, mdef->monitor->name) != 0) + continue; + + /* + * Monitor found! + */ + if (enable) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + if (!retval) + retval = count; + + break; + } + + mutex_unlock(&rv_interface_lock); + return retval; +} + +static const struct file_operations enabled_monitors_ops = { + .open = enabled_monitors_open, + .read = seq_read, + .write = enabled_monitors_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Monitoring on global switcher! + */ +static bool __read_mostly monitoring_on; + +/** + * rv_monitoring_on - checks if monitoring is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_monitoring_on(void) +{ + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_rmb(); + return READ_ONCE(monitoring_on); +} + +/* + * monitoring_on general switcher. + */ +static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, + size_t count, loff_t *ppos) +{ + const char *buff; + + buff = rv_monitoring_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static void turn_monitoring_off(void) +{ + WRITE_ONCE(monitoring_on, false); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void reset_all_monitors(void) +{ + struct rv_monitor_def *mdef; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (mdef->monitor->enabled) + mdef->monitor->reset(); + } +} + +static void turn_monitoring_on(void) +{ + WRITE_ONCE(monitoring_on, true); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void turn_monitoring_on_with_reset(void) +{ + lockdep_assert_held(&rv_interface_lock); + + if (rv_monitoring_on()) + return; + + /* + * Monitors might be out of sync with the system if events were not + * processed because of !rv_monitoring_on(). + * + * Reset all monitors, forcing a re-sync. + */ + reset_all_monitors(); + turn_monitoring_on(); +} + +static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_monitoring_on_with_reset(); + else + turn_monitoring_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations monitoring_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitoring_on_write_data, + .read = monitoring_on_read_data, +}; + +static void destroy_monitor_dir(struct rv_monitor_def *mdef) +{ + reactor_cleanup_monitor(mdef); + rv_remove(mdef->root_d); +} + +/** + * rv_register_monitor - register a rv monitor. + * @monitor: The rv_monitor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *r; + int retval = 0; + + if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { + pr_info("Monitor %s has a name longer than %d\n", monitor->name, + MAX_RV_MONITOR_NAME_SIZE); + return -1; + } + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(monitor->name, r->monitor->name) == 0) { + pr_info("Monitor %s is already registered\n", monitor->name); + retval = -1; + goto out_unlock; + } + } + + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); + if (!r) { + retval = -ENOMEM; + goto out_unlock; + } + + r->monitor = monitor; + + retval = create_monitor_dir(r); + if (retval) { + kfree(r); + goto out_unlock; + } + + list_add_tail(&r->list, &rv_monitors_list); + +out_unlock: + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_monitor - unregister a rv monitor. + * @monitor: The rv_monitor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { + if (strcmp(monitor->name, ptr->monitor->name) == 0) { + rv_disable_monitor(ptr); + list_del(&ptr->list); + destroy_monitor_dir(ptr); + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +int __init rv_init_interface(void) +{ + struct dentry *tmp; + int retval; + + rv_root.root_dir = rv_create_dir("rv", NULL); + if (!rv_root.root_dir) + goto out_err; + + rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + if (!rv_root.monitors_dir) + goto out_err; + + tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + &available_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + &enabled_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + &monitoring_on_fops); + if (!tmp) + goto out_err; + retval = init_rv_reactors(rv_root.root_dir); + if (retval) + goto out_err; + + turn_monitoring_on(); + + return 0; + +out_err: + rv_remove(rv_root.root_dir); + printk(KERN_ERR "RV: Error while creating the RV interface\n"); + return 1; +} diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h new file mode 100644 index 000000000000..db6cb0913dbd --- /dev/null +++ b/kernel/trace/rv/rv.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/mutex.h> + +struct rv_interface { + struct dentry *root_dir; + struct dentry *monitors_dir; +}; + +#include "../trace.h" +#include <linux/tracefs.h> +#include <linux/rv.h> + +#define RV_MODE_WRITE TRACE_MODE_WRITE +#define RV_MODE_READ TRACE_MODE_READ + +#define rv_create_dir tracefs_create_dir +#define rv_create_file tracefs_create_file +#define rv_remove tracefs_remove + +#define MAX_RV_MONITOR_NAME_SIZE 32 +#define MAX_RV_REACTOR_NAME_SIZE 32 + +extern struct mutex rv_interface_lock; + +#ifdef CONFIG_RV_REACTORS +struct rv_reactor_def { + struct list_head list; + struct rv_reactor *reactor; + /* protected by the monitor interface lock */ + int counter; +}; +#endif + +struct rv_monitor_def { + struct list_head list; + struct rv_monitor *monitor; + struct dentry *root_d; +#ifdef CONFIG_RV_REACTORS + struct rv_reactor_def *rdef; + bool reacting; +#endif + bool task_monitor; +}; + +struct dentry *get_monitors_root(void); +int rv_disable_monitor(struct rv_monitor_def *mdef); +int rv_enable_monitor(struct rv_monitor_def *mdef); + +#ifdef CONFIG_RV_REACTORS +int reactor_populate_monitor(struct rv_monitor_def *mdef); +void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int init_rv_reactors(struct dentry *root_dir); +#else +static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + return 0; +} + +static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + return; +} + +static inline int init_rv_reactors(struct dentry *root_dir) +{ + return 0; +} +#endif diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c new file mode 100644 index 000000000000..6aae106695b6 --- /dev/null +++ b/kernel/trace/rv/rv_reactors.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Runtime reactor interface. + * + * A runtime monitor can cause a reaction to the detection of an + * exception on the model's execution. By default, the monitors have + * tracing reactions, printing the monitor output via tracepoints. + * But other reactions can be added (on-demand) via this interface. + * + * == Registering reactors == + * + * The struct rv_reactor defines a callback function to be executed + * in case of a model exception happens. The callback function + * receives a message to be (optionally) printed before executing + * the reaction. + * + * A RV reactor is registered via: + * int rv_register_reactor(struct rv_reactor *reactor) + * And unregistered via: + * int rv_unregister_reactor(struct rv_reactor *reactor) + * + * These functions are exported to modules, enabling reactors to be + * dynamically loaded. + * + * == User interface == + * + * The user interface resembles the kernel tracing interface and + * presents these files: + * + * "available_reactors" + * - List the available reactors, one per line. + * + * For example: + * # cat available_reactors + * nop + * panic + * printk + * + * "reacting_on" + * - It is an on/off general switch for reactors, disabling + * all reactions. + * + * "monitors/MONITOR/reactors" + * - List available reactors, with the select reaction for the given + * MONITOR inside []. The default one is the nop (no operation) + * reactor. + * - Writing the name of an reactor enables it to the given + * MONITOR. + * + * For example: + * # cat monitors/wip/reactors + * [nop] + * panic + * printk + * # echo panic > monitors/wip/reactors + * # cat monitors/wip/reactors + * nop + * [panic] + * printk + */ + +#include <linux/slab.h> + +#include "rv.h" + +/* + * Interface for the reactor register. + */ +static LIST_HEAD(rv_reactors_list); + +static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(name, r->reactor->name) == 0) + return r; + } + return NULL; +} + +/* + * Available reactors seq functions. + */ +static int reactors_show(struct seq_file *m, void *p) +{ + struct rv_reactor_def *rea_def = p; + + seq_printf(m, "%s\n", rea_def->reactor->name); + return 0; +} + +static void reactors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +static void *reactors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_reactors_list, *pos); +} + +static void *reactors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_reactors_list, pos); +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations available_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = reactors_show +}; + +/* + * available_reactors interface. + */ +static int available_reactors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_reactors_seq_ops); +}; + +static const struct file_operations available_reactors_ops = { + .open = available_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * Monitor's reactor file. + */ +static int monitor_reactor_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mdef = m->private; + struct rv_reactor_def *rdef = p; + + if (mdef->rdef == rdef) + seq_printf(m, "[%s]\n", rdef->reactor->name); + else + seq_printf(m, "%s\n", rdef->reactor->name); + return 0; +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations monitor_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = monitor_reactor_show +}; + +static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, + bool reacting) +{ + bool monitor_enabled; + + /* nothing to do */ + if (mdef->rdef == rdef) + return; + + monitor_enabled = mdef->monitor->enabled; + if (monitor_enabled) + rv_disable_monitor(mdef); + + /* swap reactor's usage */ + mdef->rdef->counter--; + rdef->counter++; + + mdef->rdef = rdef; + mdef->reacting = reacting; + mdef->monitor->react = rdef->reactor->react; + + if (monitor_enabled) + rv_enable_monitor(mdef); +} + +static ssize_t +monitor_reactors_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + struct rv_reactor_def *rdef; + struct seq_file *seq_f; + int retval = -EINVAL; + bool enable; + char *ptr; + int len; + + if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + len = strlen(ptr); + if (!len) + return count; + + /* + * See monitor_reactors_open() + */ + seq_f = file->private_data; + mdef = seq_f->private; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(rdef, &rv_reactors_list, list) { + if (strcmp(ptr, rdef->reactor->name) != 0) + continue; + + if (rdef == get_reactor_rdef_by_name("nop")) + enable = false; + else + enable = true; + + monitor_swap_reactors(mdef, rdef, enable); + + retval = count; + break; + } + + mutex_unlock(&rv_interface_lock); + + return retval; +} + +/* + * available_reactors interface. + */ +static int monitor_reactors_open(struct inode *inode, struct file *file) +{ + struct rv_monitor_def *mdef = inode->i_private; + struct seq_file *seq_f; + int ret; + + ret = seq_open(file, &monitor_reactors_seq_ops); + if (ret < 0) + return ret; + + /* + * seq_open stores the seq_file on the file->private data. + */ + seq_f = file->private_data; + + /* + * Copy the create file "private" data to the seq_file private data. + */ + seq_f->private = mdef; + + return 0; +}; + +static const struct file_operations monitor_reactors_ops = { + .open = monitor_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = monitor_reactors_write +}; + +static int __rv_register_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(reactor->name, r->reactor->name) == 0) { + pr_info("Reactor %s is already registered\n", reactor->name); + return -EINVAL; + } + } + + r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->reactor = reactor; + r->counter = 0; + + list_add_tail(&r->list, &rv_reactors_list); + + return 0; +} + +/** + * rv_register_reactor - register a rv reactor. + * @reactor: The rv_reactor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_reactor(struct rv_reactor *reactor) +{ + int retval = 0; + + if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { + pr_info("Reactor %s has a name longer than %d\n", + reactor->name, MAX_RV_MONITOR_NAME_SIZE); + return -EINVAL; + } + + mutex_lock(&rv_interface_lock); + retval = __rv_register_reactor(reactor); + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_reactor - unregister a rv reactor. + * @reactor: The rv_reactor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *ptr, *next; + int ret = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { + if (strcmp(reactor->name, ptr->reactor->name) == 0) { + + if (!ptr->counter) { + list_del(&ptr->list); + } else { + printk(KERN_WARNING + "rv: the rv_reactor %s is in use by %d monitor(s)\n", + ptr->reactor->name, ptr->counter); + printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", + ptr->reactor->name); + ret = -EBUSY; + break; + } + } + } + + mutex_unlock(&rv_interface_lock); + return ret; +} + +/* + * reacting_on interface. + */ +static bool __read_mostly reacting_on; + +/** + * rv_reacting_on - checks if reacting is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_reacting_on(void) +{ + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_rmb(); + return READ_ONCE(reacting_on); +} + +static ssize_t reacting_on_read_data(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + char *buff; + + buff = rv_reacting_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +static void turn_reacting_off(void) +{ + WRITE_ONCE(reacting_on, false); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static void turn_reacting_on(void) +{ + WRITE_ONCE(reacting_on, true); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_reacting_on(); + else + turn_reacting_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations reacting_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = reacting_on_write_data, + .read = reacting_on_read_data, +}; + +/** + * reactor_populate_monitor - creates per monitor reactors file + * @mdef: monitor's definition. + * + * Returns 0 if successful, error otherwise. + */ +int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + struct dentry *tmp; + + tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + if (!tmp) + return -ENOMEM; + + /* + * Configure as the rv_nop reactor. + */ + mdef->rdef = get_reactor_rdef_by_name("nop"); + mdef->rdef->counter++; + mdef->reacting = false; + + return 0; +} + +/** + * reactor_cleanup_monitor - cleanup a monitor reference + * @mdef: monitor's definition. + */ +void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + lockdep_assert_held(&rv_interface_lock); + mdef->rdef->counter--; + WARN_ON_ONCE(mdef->rdef->counter < 0); +} + +/* + * Nop reactor register + */ +static void rv_nop_reaction(char *msg) +{ +} + +static struct rv_reactor rv_nop = { + .name = "nop", + .description = "no-operation reactor: do nothing.", + .react = rv_nop_reaction +}; + +int init_rv_reactors(struct dentry *root_dir) +{ + struct dentry *available, *reacting; + int retval; + + available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, + &available_reactors_ops); + if (!available) + goto out_err; + + reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); + if (!reacting) + goto rm_available; + + retval = __rv_register_reactor(&rv_nop); + if (retval) + goto rm_reacting; + + turn_reacting_on(); + + return 0; + +rm_reacting: + rv_remove(reacting); +rm_available: + rv_remove(available); +out_err: + return -ENOMEM; +} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c517c8c8999..d3005279165d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5569,13 +5569,13 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p[:[<group>/]<event>] <place> [<args>]\n" - "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n" + "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" + "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif - "\t e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]\n" - "\t -:[<group>/]<event>\n" + "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n" + "\t -:[<group>/][<event>]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" @@ -9101,6 +9101,16 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size return 0; } +static void free_trace_buffer(struct array_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; @@ -9113,10 +9123,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { - ring_buffer_free(tr->array_buffer.buffer); - tr->array_buffer.buffer = NULL; - free_percpu(tr->array_buffer.data); - tr->array_buffer.data = NULL; + free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; @@ -9131,16 +9138,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } -static void free_trace_buffer(struct array_buffer *buf) -{ - if (buf->buffer) { - ring_buffer_free(buf->buffer); - buf->buffer = NULL; - free_percpu(buf->data); - buf->data = NULL; - } -} - static void free_trace_buffers(struct trace_array *tr) { if (!tr) @@ -9772,6 +9769,8 @@ static __init int tracer_init_tracefs(void) tracer_init_tracefs_work_func(NULL); } + rv_init_interface(); + return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ff816fb41e48..900e75d96c84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2005,4 +2005,13 @@ struct trace_min_max_param { extern const struct file_operations trace_min_max_fops; +#ifdef CONFIG_RV +extern int rv_init_interface(void); +#else +static inline int rv_init_interface(void) +{ + return 0; +} +#endif + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 076b447a1b88..154996684fb5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -101,7 +101,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') { + if (!system && event[0] == '\0') { ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 7d4478525c66..1783e3478912 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -125,6 +125,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, * We match the following: * event only - match all eprobes with event name * system and event only - match all system/event probes + * system only - match all system probes * * The below has the above satisfied with more arguments: * @@ -143,7 +144,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, return false; /* Must match the event name */ - if (strcmp(trace_probe_name(&ep->tp), event) != 0) + if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0) return false; /* No arguments match all */ @@ -226,6 +227,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) struct probe_arg *parg = &ep->tp.args[i]; struct ftrace_event_field *field; struct list_head *head; + int ret = -ENOENT; head = trace_get_fields(ep->event); list_for_each_entry(field, head, link) { @@ -235,9 +237,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) return 0; } } + + /* + * Argument not found on event. But allow for comm and COMM + * to be used to get the current->comm. + */ + if (strcmp(parg->code->data, "COMM") == 0 || + strcmp(parg->code->data, "comm") == 0) { + parg->code->op = FETCH_OP_COMM; + ret = 0; + } + kfree(parg->code->data); parg->code->data = NULL; - return -ENOENT; + return ret; } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -310,6 +323,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) addr = rec + field->offset; + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + switch (field->size) { case 1: if (field->is_signed) @@ -341,16 +375,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) static int get_eprobe_size(struct trace_probe *tp, void *rec) { + struct fetch_insn *code; struct probe_arg *arg; int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { + if (arg->dynamic) { unsigned long val; - val = get_event_field(arg->code, rec); - len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); if (len > 0) ret += len; } @@ -368,8 +424,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { unsigned long val; - val = get_event_field(code, rec); - return process_fetch_insn_bottom(code + 1, val, dest, base); + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) @@ -838,8 +914,15 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { ret = trace_eprobe_tp_arg_update(ep, i); + if (ret) + trace_probe_log_err(0, BAD_ATTACH_ARG); + } + + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); return ret; } @@ -848,7 +931,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] * Fetch args: * <name>=$<field>[:TYPE] */ @@ -858,6 +941,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) struct trace_eprobe *ep = NULL; char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0; int i; @@ -869,25 +953,25 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { event++; - ret = traceprobe_parse_event_name(&event, &group, buf1, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); - event = buf1; } - if (!is_good_name(event) || !is_good_name(group)) - goto parse_error; + trace_probe_log_set_index(1); sys_event = argv[1]; - ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, - sys_event - argv[1]); - if (ret || !sys_name) - goto parse_error; - if (!is_good_name(sys_event) || !is_good_name(sys_name)) + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); + if (ret || !sys_event || !sys_name) { + trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; + } + + if (!event) { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); @@ -896,6 +980,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (IS_ERR(ep)) { ret = PTR_ERR(ep); + if (ret == -ENODEV) + trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); ep = NULL; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a114549720d6..61e3a2620fa3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } @@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); @@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 181f08186d32..0356cae0cf74 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -176,6 +176,7 @@ static int trace_define_generic_fields(void) __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e87a46794079..fdf784620c28 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4455,7 +4455,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, ret = parse_var_defs(hist_data); if (ret) - goto out; + return ret; ret = create_val_fields(hist_data, file); if (ret) @@ -4466,8 +4466,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, goto out; ret = create_key_fields(hist_data, file); - if (ret) - goto out; + out: free_var_defs(hist_data); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 706e1686b5eb..a6621c52ce45 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -567,7 +567,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) * to allow user_event files to be less locked down. The extreme case * being "other" has read/write access to user_events_data/status. * - * When not locked down, processes may not have have permissions to + * When not locked down, processes may not have permissions to * add/remove calls themselves to tracefs. We need to temporarily * switch to root file permission to allow for this scenario. */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a245ea673715..23f7f0ec4f4c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -163,7 +163,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_probe_name(&tk->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tk->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && trace_kprobe_match_command_head(tk, argc, argv); } @@ -708,11 +709,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* * Argument syntax: * - Add kprobe: - * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: - * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS] * Or - * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -739,6 +740,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; switch (argv[0][0]) { @@ -833,11 +835,13 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { + } + + if (!event) { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 80863c6508e5..36dff277de46 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -257,6 +257,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } len = strlen(event); if (len == 0) { + if (slash) { + *pevent = NULL; + return 0; + } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { @@ -279,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (strcmp(arg, "retval") == 0) { + if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; + } else if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; } else { @@ -303,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else goto inval_var; - } else if (strcmp(arg, "comm") == 0) { + } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == @@ -319,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif - } else if (flags & TPARG_FL_TPOINT) { - if (code->data) - return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -380,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ + if (flags & TPARG_FL_TPOINT) { + /* eprobes do not handle registers */ + trace_probe_log_err(offs, BAD_VAR); + break; + } ret = regs_query_register_offset(arg + 1); if (ret >= 0) { code->op = FETCH_OP_REG; @@ -613,9 +622,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* * Since $comm and immediate string can not be dereferenced, - * we can find those by strcmp. + * we can find those by strcmp. But ignore for eprobes. */ - if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { + if (!(flags & TPARG_FL_TPOINT) && + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 92cc149af0fd..3b3869ae8cfd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -442,7 +442,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(FAIL_REG_PROBE, "Failed to register probe event"),\ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ - C(SAME_PROBE, "There is already the exact same probe event"), + C(SAME_PROBE, "There is already the exact same probe event"),\ + C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ + C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ + C(BAD_ATTACH_ARG, "Attached event does not have this field"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 88ba5b4bd0c5..fb58e86dd117 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -313,7 +313,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_probe_name(&tu->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tu->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && trace_uprobe_match_command_head(tu, argc, argv); } @@ -533,7 +534,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { @@ -541,6 +542,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; @@ -645,11 +647,13 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto fail_address_parse; - } else { + } + + if (!event) { char *tail; char *ptr; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ecb0e8346e65..8e61f21e7e33 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -537,7 +537,7 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -557,6 +557,13 @@ static void lockup_detector_reconfigure(void) __lockup_detector_cleanup(); } +void lockup_detector_reconfigure(void) +{ + mutex_lock(&watchdog_mutex); + __lockup_detector_reconfigure(); + mutex_unlock(&watchdog_mutex); +} + /* * Create the watchdog infrastructure and configure the detector(s). */ @@ -573,13 +580,13 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -587,9 +594,13 @@ static void lockup_detector_reconfigure(void) watchdog_nmi_start(); cpus_read_unlock(); } +void lockup_detector_reconfigure(void) +{ + __lockup_detector_reconfigure(); +} static inline void lockup_detector_setup(void) { - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -629,7 +640,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); } /* |