diff options
Diffstat (limited to 'kernel')
166 files changed, 7405 insertions, 3831 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 186c49582f45..56f4ee97f328 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -67,6 +67,7 @@ obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/audit.c b/kernel/audit.c index 121d37e700a6..e4bbe2c70c26 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -718,7 +718,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, { int rc = 0; struct sk_buff *skb; - static unsigned int failed = 0; + unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ @@ -735,32 +735,30 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, continue; } +retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { - /* fatal failure for our queue flush attempt? */ + /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { - /* yes - error processing for the queue */ sk = NULL; if (err_hook) (*err_hook)(skb); - if (!skb_hook) - goto out; - /* keep processing with the skb_hook */ + if (rc == -EAGAIN) + rc = 0; + /* continue to drain the queue */ continue; } else - /* no - requeue to preserve ordering */ - skb_queue_head(queue, skb); + goto retry; } else { - /* it worked - drop the extra reference and continue */ + /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } -out: return (rc >= 0 ? 0 : rc); } @@ -1446,7 +1444,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; } - sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { if (audit_sig_sid) security_release_secctx(ctx, len); @@ -1459,7 +1457,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) security_release_secctx(ctx, len); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, - sig_data, sizeof(*sig_data) + len); + sig_data, struct_size(sig_data, ctx, len)); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -1542,6 +1540,20 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); + + /* can't block with the ctrl lock, so penalize the sender now */ + if (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + DECLARE_WAITQUEUE(wait, current); + + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); + + add_wait_queue_exclusive(&audit_backlog_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(audit_backlog_wait_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } } /* Log information about who is connecting to the audit multicast socket */ @@ -1609,7 +1621,8 @@ static int __net_init audit_net_init(struct net *net) audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } - aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + /* limit the timeout in case auditd is blocked/stopped */ + aunet->sk->sk_sndtimeo = HZ / 10; return 0; } @@ -1825,7 +1838,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block - * while holding the mutex */ + * while holding the mutex, although we do penalize the sender + * later in audit_receive() when it is safe to block + */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; @@ -2132,7 +2147,7 @@ int audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - security_task_getsecid_subj(current, &sid); + security_current_getsecid_subj(&sid); if (!sid) return 0; @@ -2353,7 +2368,7 @@ int audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - security_task_getsecid_subj(current, &audit_sig_sid); + security_current_getsecid_subj(&audit_sig_sid); } return audit_signal_info_syscall(t); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 72324afcffef..e7315d487163 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; - tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); + tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d75acb014ccd..42d99896e7a6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -637,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) void *bufp; int i; - data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); + data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; memset(data, 0, sizeof(*data)); @@ -1092,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q) break; skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, data, - sizeof(*data) + data->buflen); + struct_size(data, buf, data->buflen)); if (skb) skb_queue_tail(q, skb); kfree(data); @@ -1368,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { - security_task_getsecid_subj(current, - &sid); + security_current_getsecid_subj(&sid); result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b517947bfa48..fce5d43a933f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -666,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk, logged upon error */ if (f->lsm_rule) { if (need_sid) { - security_task_getsecid_subj(tsk, &sid); + /* @tsk should always be equal to + * @current with the exception of + * fork()/copy_process() in which case + * the new @tsk creds are still a dup + * of @current's creds so we can still + * use security_current_getsecid_subj() + * here even though it always refs + * @current's creds + */ + security_current_getsecid_subj(&sid); need_sid = 0; } result = security_audit_rule_match(sid, f->type, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index cf6ca339f3cd..c1a9be6a4b9f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ + +obj-$(CONFIG_BPF_SYSCALL) += relo_core.o +$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE + $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 277a05e9c984..b141a1346f72 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value) return -EOPNOTSUPP; } +static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, + .map_get_next_key = bloom_map_get_next_key, .map_push_elem = bloom_map_push_elem, .map_peek_elem = bloom_map_peek_elem, .map_pop_elem = bloom_map_pop_elem, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 96ceed0e0fb5..e29d9e3d853e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -17,6 +17,7 @@ #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> +#include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, if (!bsb) return NULL; - inode_storage = rcu_dereference(bsb->storage); + inode_storage = + rcu_dereference_check(bsb->storage, bpf_rcu_lock_held()); if (!inode_storage) return NULL; @@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; @@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, BPF_CALL_2(bpf_inode_storage_delete, struct bpf_map *, map, struct inode *, inode) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!inode) return -EINVAL; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b2ee45064e06..b7aef5b3416d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; + +/* maximum number of loops */ +#define MAX_LOOPS BIT(23) + +BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, + u64, flags) +{ + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 ret; + u32 i; + + if (flags) + return -EINVAL; + if (nr_loops > MAX_LOOPS) + return -E2BIG; + + for (i = 0; i < nr_loops; i++) { + ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + return i + 1; + } + + return i; +} + +const struct bpf_func_proto bpf_loop_proto = { + .func = bpf_loop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b305270b7a4b..71de2a89869c 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -11,6 +11,9 @@ #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/rcupdate_wait.h> #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) @@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } +void bpf_local_storage_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage *local_storage; + + local_storage = container_of(rcu, struct bpf_local_storage, rcu); + kfree_rcu(local_storage, rcu); +} + +static void bpf_selem_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + kfree_rcu(selem, rcu); +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, bool free_local_storage; void *owner; - smap = rcu_dereference(SDATA(selem)->smap); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); owner = local_storage->owner; /* All uncharging on the owner must be done first. @@ -118,12 +137,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, * * Although the unlock will be done under * rcu_read_lock(), it is more intutivie to - * read if kfree_rcu(local_storage, rcu) is done + * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * * Hence, a "bool free_local_storage" is returned - * to the caller which then calls the kfree_rcu() - * after unlock. + * to the caller which then calls then frees the storage after + * all the RCU grace periods have expired. */ } hlist_del_init_rcu(&selem->snode); @@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - kfree_rcu(selem, rcu); - + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); return free_local_storage; } @@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) /* selem has already been unlinked from sk */ return; - local_storage = rcu_dereference(selem->local_storage); + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( @@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) - kfree_rcu(local_storage, rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_rcu); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) /* selem has already be unlinked from smap */ return; - smap = rcu_dereference(SDATA(selem)->smap); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) @@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem; /* Fast path (cache hit) */ - sdata = rcu_dereference(local_storage->cache[smap->cache_idx]); + sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], + bpf_rcu_lock_held()); if (sdata && rcu_access_pointer(sdata->smap) == smap) return sdata; /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + hlist_for_each_entry_rcu(selem, &local_storage->list, snode, + rcu_read_lock_trace_held()) if (rcu_access_pointer(SDATA(selem)->smap) == smap) break; @@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner, * bucket->list, first_selem can be freed immediately * (instead of kfree_rcu) because * bpf_local_storage_map_free() does a - * synchronize_rcu() before walking the bucket->list. + * synchronize_rcu_mult (waiting for both sleepable and + * normal programs) before walking the bucket->list. * Hence, no one is accessing selem from the * bucket->list under rcu_read_lock(). */ @@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); - local_storage = rcu_dereference(*owner_storage(smap, owner)); + local_storage = rcu_dereference_check(*owner_storage(smap, owner), + bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ecfe4752769..21069dbe9138 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -165,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) break; } - if (btf_member_bitfield_size(t, member)) { + if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); break; @@ -296,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) const struct btf_type *mtype; for_each_member(i, t, member) { - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (moff > prev_mend && memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; @@ -387,7 +387,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_prog *prog; u32 moff; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ebfa8bc90892..5da7bed0f5f6 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -17,6 +17,7 @@ #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> +#include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); @@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - task_storage = rcu_dereference(task->bpf_storage); + task_storage = + rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held()); if (!task_storage) return NULL; @@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; @@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, { int ret; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; @@ -323,7 +327,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_task_struct_ids[0], + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -334,5 +338,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_task_struct_ids[0], + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc3ad07e21b..e16dafeb2450 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,7 @@ #include <linux/kobject.h> #include <linux/sysfs.h> #include <net/sock.h> +#include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus @@ -282,6 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", + [BTF_KIND_TYPE_TAG] = "TYPE_TAG", }; const char *btf_type_str(const struct btf_type *t) @@ -418,6 +420,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: return true; } @@ -834,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show) const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; - const struct btf_type *t = show->state.type; + const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; @@ -1737,6 +1740,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: id = type->type; type = btf_type_by_id(btf, type->type); break; @@ -2345,6 +2349,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { + const char *value; + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -2360,7 +2366,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - /* typedef type must have a valid name, and other ref types, + /* typedef/type_tag type must have a valid name, and other ref types, * volatile, const, restrict, should have a null name. */ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { @@ -2369,6 +2375,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } + } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) { + value = btf_name_by_offset(env->btf, t->name_off); + if (!value || !value[0]) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } } else { if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); @@ -2958,7 +2970,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - offset = btf_member_bit_offset(t, member); + offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); @@ -3083,7 +3095,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t if (off != -ENOENT) /* only one such field is allowed */ return -E2BIG; - off = btf_member_bit_offset(t, member); + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; @@ -3173,8 +3185,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, btf_show_start_member(show, member); - member_offset = btf_member_bit_offset(t, member); - bitfield_size = btf_member_bitfield_size(t, member); + member_offset = __btf_member_bit_offset(t, member); + bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { @@ -4059,6 +4071,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = &datasec_ops, [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, + [BTF_KIND_TYPE_TAG] = &modifier_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -4460,8 +4473,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, log->len_total = log_size; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) { + if (!bpf_verifier_log_attr_valid(log)) { err = -EINVAL; goto errout; } @@ -4814,7 +4826,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_string_ptr(struct btf *btf, const struct btf_type *t) +static bool is_int_ptr(struct btf *btf, const struct btf_type *t) { /* t comes in already as a pointer */ t = btf_type_by_id(btf, t->type); @@ -4823,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t) if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) t = btf_type_by_id(btf, t->type); - /* char, signed char, unsigned char */ - return btf_type_is_int(t) && t->size == 1; + return btf_type_is_int(t); } bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -4929,10 +4940,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag; - if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } @@ -4945,7 +4958,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, */ return true; - if (is_string_ptr(btf, t)) + if (is_int_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -5048,7 +5061,7 @@ again: if (array_elem->nelems != 0) goto error; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; @@ -5071,14 +5084,14 @@ error: for_each_member(i, t, member) { /* offset of the field in bytes */ - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; - if (btf_member_bitfield_size(t, member)) { - u32 end_bit = btf_member_bit_offset(t, member) + - btf_member_bitfield_size(t, member); + if (__btf_member_bitfield_size(t, member)) { + u32 end_bit = __btf_member_bit_offset(t, member) + + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous @@ -5563,12 +5576,53 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #endif }; +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int rec) +{ + const struct btf_type *member_type; + const struct btf_member *member; + u32 i; + + if (!btf_type_is_struct(t)) + return false; + + for_each_member(i, t, member) { + const struct btf_array *array; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (btf_type_is_struct(member_type)) { + if (rec >= 3) { + bpf_log(log, "max struct nesting depth exceeded\n"); + return false; + } + if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1)) + return false; + continue; + } + if (btf_type_is_array(member_type)) { + array = btf_type_array(member_type); + if (!array->nelems) + return false; + member_type = btf_type_skip_modifiers(btf, array->type, NULL); + if (!btf_type_is_scalar(member_type)) + return false; + continue; + } + if (!btf_type_is_scalar(member_type)) + return false; + } + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; + bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; @@ -5621,7 +5675,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_is_kernel(btf)) { + if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. + */ + if (reg->type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + if (check_ptr_off_reg(env, reg, regno)) + return -EINVAL; + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5637,14 +5704,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - } else if (reg2btf_ids[reg->type]) { + } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[reg->type]; - } else { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", - func_name, i, - btf_type_str(ref_t), ref_tname, regno); - return -EINVAL; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, @@ -5660,23 +5722,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { - /* If function expects ctx type in BTF check that caller - * is passing PTR_TO_CTX. - */ - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - if (check_ctx_reg(env, reg, regno)) - return -EINVAL; } else if (ptr_to_mem_ok) { const struct btf_type *resolve_ret; u32 type_size; + if (is_kfunc) { + /* Permit pointer to mem, but only when argument + * type is pointer to scalar, or struct composed + * (recursively) of scalars. + */ + if (!btf_type_is_scalar(ref_t) && + !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) { + bpf_log(log, + "arg#%d pointer type %s %s must point to scalar or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + } + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { bpf_log(log, @@ -5689,6 +5752,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (check_mem_reg(env, reg, regno, type_size)) return -EINVAL; } else { + bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i, + is_kfunc ? "kernel " : "", func_name, func_id); return -EINVAL; } } @@ -5738,7 +5803,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs) { - return btf_check_func_arg_match(env, btf, func_id, regs, false); + return btf_check_func_arg_match(env, btf, func_id, regs, true); } /* Convert BTF of a function into bpf_reg_state if possible @@ -5846,7 +5911,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EINVAL; } - reg->type = PTR_TO_MEM_OR_NULL; + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; reg->id = ++env->id_gen; continue; @@ -6157,6 +6222,8 @@ btf_module_read(struct file *file, struct kobject *kobj, return len; } +static void purge_cand_cache(struct btf *btf); + static int btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { @@ -6191,6 +6258,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, goto out; } + purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; @@ -6233,6 +6301,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); @@ -6336,21 +6405,19 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .func = bpf_btf_find_by_name_kind, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) +BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) +#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type) +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE /* BTF ID set registration API for modules */ -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -6376,8 +6443,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, { struct kfunc_btf_id_set *s; - if (!owner) - return false; mutex_lock(&klist->mutex); list_for_each_entry(s, &klist->list, list) { if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { @@ -6389,8 +6454,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, return false; } -#endif - #define DEFINE_KFUNC_BTF_ID_LIST(name) \ struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ __MUTEX_INITIALIZER(name.mutex) }; \ @@ -6398,3 +6461,386 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + +#endif + +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return -EOPNOTSUPP; +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} + +struct bpf_cand_cache { + const char *name; + u32 name_len; + u16 kind; + u16 cnt; + struct { + const struct btf *btf; + u32 id; + } cands[]; +}; + +static void bpf_free_cands(struct bpf_cand_cache *cands) +{ + if (!cands->cnt) + /* empty candidate array was allocated on stack */ + return; + kfree(cands); +} + +static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands) +{ + kfree(cands->name); + kfree(cands); +} + +#define VMLINUX_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; + +#define MODULE_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; + +static DEFINE_MUTEX(cand_cache_mutex); + +static void __print_cand_cache(struct bpf_verifier_log *log, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + bpf_log(log, "[%d]%s(", i, cc->name); + for (j = 0; j < cc->cnt; j++) { + bpf_log(log, "%d", cc->cands[j].id); + if (j < cc->cnt - 1) + bpf_log(log, " "); + } + bpf_log(log, "), "); + } +} + +static void print_cand_cache(struct bpf_verifier_log *log) +{ + mutex_lock(&cand_cache_mutex); + bpf_log(log, "vmlinux_cand_cache:"); + __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + bpf_log(log, "\nmodule_cand_cache:"); + __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE); + bpf_log(log, "\n"); + mutex_unlock(&cand_cache_mutex); +} + +static u32 hash_cands(struct bpf_cand_cache *cands) +{ + return jhash(cands->name, cands->name_len, 0); +} + +static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size]; + + if (cc && cc->name_len == cands->name_len && + !strncmp(cc->name, cands->name, cands->name_len)) + return cc; + return NULL; +} + +static size_t sizeof_cands(int cnt) +{ + return offsetof(struct bpf_cand_cache, cands[cnt]); +} + +static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands; + + if (*cc) { + bpf_free_cands_from_cache(*cc); + *cc = NULL; + } + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + /* strdup the name, since it will stay in cache. + * the cands->name points to strings in prog's BTF and the prog can be unloaded. + */ + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + bpf_free_cands(cands); + if (!new_cands->name) { + kfree(new_cands); + return ERR_PTR(-ENOMEM); + } + *cc = new_cands; + return new_cands; +} + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + if (!btf) { + /* when new module is loaded purge all of module_cand_cache, + * since new module might have candidates with the name + * that matches cached cands. + */ + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + continue; + } + /* when module is unloaded purge cache entries + * that match module's btf + */ + for (j = 0; j < cc->cnt; j++) + if (cc->cands[j].btf == btf) { + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + break; + } + } + +} + +static void purge_cand_cache(struct btf *btf) +{ + mutex_lock(&cand_cache_mutex); + __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); + mutex_unlock(&cand_cache_mutex); +} +#endif + +static struct bpf_cand_cache * +bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, + int targ_start_id) +{ + struct bpf_cand_cache *new_cands; + const struct btf_type *t; + const char *targ_name; + size_t targ_essent_len; + int n, i; + + n = btf_nr_types(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf_type_by_id(targ_btf, i); + if (btf_kind(t) != cands->kind) + continue; + + targ_name = btf_name_by_offset(targ_btf, t->name_off); + if (!targ_name) + continue; + + /* the resched point is before strncmp to make sure that search + * for non-existing name will have a chance to schedule(). + */ + cond_resched(); + + if (strncmp(cands->name, targ_name, cands->name_len) != 0) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != cands->name_len) + continue; + + /* most of the time there is only one candidate for a given kind+name pair */ + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + + memcpy(new_cands, cands, sizeof_cands(cands->cnt)); + bpf_free_cands(cands); + cands = new_cands; + cands->cands[cands->cnt].btf = targ_btf; + cands->cands[cands->cnt].id = i; + cands->cnt++; + } + return cands; +} + +static struct bpf_cand_cache * +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) +{ + struct bpf_cand_cache *cands, *cc, local_cand = {}; + const struct btf *local_btf = ctx->btf; + const struct btf_type *local_type; + const struct btf *main_btf; + size_t local_essent_len; + struct btf *mod_btf; + const char *name; + int id; + + main_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(main_btf)) + return ERR_CAST(main_btf); + + local_type = btf_type_by_id(local_btf, local_type_id); + if (!local_type) + return ERR_PTR(-EINVAL); + + name = btf_name_by_offset(local_btf, local_type->name_off); + if (str_is_empty(name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(name); + + cands = &local_cand; + cands->name = name; + cands->kind = btf_kind(local_type); + cands->name_len = local_essent_len; + + cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + /* cands is a pointer to stack here */ + if (cc) { + if (cc->cnt) + return cc; + goto check_modules; + } + + /* Attempt to find target candidates in vmlinux BTF first */ + cands = bpf_core_add_cands(cands, main_btf, 1); + if (IS_ERR(cands)) + return ERR_CAST(cands); + + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ + + /* populate cache even when cands->cnt == 0 */ + cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + if (IS_ERR(cc)) + return ERR_CAST(cc); + + /* if vmlinux BTF has any candidate, don't go for module BTFs */ + if (cc->cnt) + return cc; + +check_modules: + /* cands is a pointer to stack here and cands->cnt == 0 */ + cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); + if (cc) + /* if cache has it return it even if cc->cnt == 0 */ + return cc; + + /* If candidate is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, mod_btf, id) { + if (!btf_is_module(mod_btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(mod_btf); + spin_unlock_bh(&btf_idr_lock); + cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + if (IS_ERR(cands)) { + btf_put(mod_btf); + return ERR_CAST(cands); + } + spin_lock_bh(&btf_idr_lock); + btf_put(mod_btf); + } + spin_unlock_bh(&btf_idr_lock); + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 + * or pointer to stack if cands->cnd == 0. + * Copy it into the cache even when cands->cnt == 0 and + * return the result. + */ + return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); +} + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn) +{ + bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; + struct bpf_core_cand_list cands = {}; + struct bpf_core_spec *specs; + int err; + + /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" + * into arrays of btf_ids of struct fields and array indices. + */ + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + if (!specs) + return -ENOMEM; + + if (need_cands) { + struct bpf_cand_cache *cc; + int i; + + mutex_lock(&cand_cache_mutex); + cc = bpf_core_find_cands(ctx, relo->type_id); + if (IS_ERR(cc)) { + bpf_log(ctx->log, "target candidate search failed for %d\n", + relo->type_id); + err = PTR_ERR(cc); + goto out; + } + if (cc->cnt) { + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + if (!cands.cands) { + err = -ENOMEM; + goto out; + } + } + for (i = 0; i < cc->cnt; i++) { + bpf_log(ctx->log, + "CO-RE relocating %s %s: found target candidate [%d]\n", + btf_kind_str[cc->kind], cc->name, cc->cands[i].id); + cands.cands[i].btf = cc->cands[i].btf; + cands.cands[i].id = cc->cands[i].id; + } + cands.len = cc->cnt; + /* cand_cache_mutex needs to span the cache lookup and + * copy of btf pointer into bpf_core_cand_list, + * since module can be unloaded while bpf_core_apply_relo_insn + * is working with module's btf. + */ + } + + err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, + relo, relo_idx, ctx->btf, &cands, specs); +out: + kfree(specs); + if (need_cands) { + kfree(cands.cands); + mutex_unlock(&cand_cache_mutex); + if (ctx->log->level & BPF_LOG_LEVEL2) + print_cand_cache(ctx->log); + } + return err; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 43eb3501721b..514b4681a90a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1789,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2405e39d800f..de3e5bc6781f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1574,7 +1574,8 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + + if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; @@ -1891,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program - * @fp: bpf_prog populated with internal BPF program + * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. @@ -2300,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work) } } -/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 585b2b77ccc4..b3e6b9422238 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } return; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, rcpu->prog, act); @@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_DROP: xdp_return_frame(xdpf); @@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct xdp_frame *xdpf; - - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f02d04540c0c..fe019dbdb3f0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, frames[nframes++] = xdpf; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); @@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, bq->q[bq->count++] = xdpf; } -static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + err = xdp_ok_fwd_dev(dev, xdpf->len); if (unlikely(err)) return err; - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } @@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev __skb_push(skb, skb->mac_len); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); @@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev return act; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx, NULL); + return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; - return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); + return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) return false; return true; @@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes) return n; } -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; - struct xdp_frame *xdpf; int num_excluded = 0; unsigned int i; int err; @@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, excluded_devices[num_excluded++] = dev_rx->ifindex; } - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) @@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 649f07623df6..01cfdf40c838 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> +#include <linux/bpf-cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> @@ -530,7 +531,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -558,13 +559,27 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; #endif +BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) +{ + return strncmp(s1, s2, s1_sz); +} + +const struct bpf_func_proto bpf_strncmp_proto = { + .func = bpf_strncmp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_CONST_STR, +}; + BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { @@ -630,7 +645,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -667,7 +682,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -680,7 +695,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; @@ -1011,7 +1026,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1376,6 +1391,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_query_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; + case BPF_FUNC_strncmp: + return &bpf_strncmp_proto; default: break; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 80da1db47c68..5a8d9f7467bf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); - if (opt < 0) + if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - return opt == -ENOPARAM ? 0 : opt; + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + + return 0; + } + + if (opt < 0) + return opt; + } switch (opt) { case OPT_MODE: diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 035e9e3a7132..23f7f9d08a62 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return 0; } - new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + - map->value_size, + new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 423549d2c52e..5763cc7ac4f1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -412,7 +412,7 @@ static int trie_update_elem(struct bpf_map *map, rcu_assign_pointer(im_node->child[1], node); } - /* Finally, assign the intermediate node to the determined spot */ + /* Finally, assign the intermediate node to the determined slot */ rcu_assign_pointer(*slot, im_node); out: diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212..b0fa190b0979 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h new file mode 100644 index 000000000000..5d18d7d85bef --- /dev/null +++ b/kernel/bpf/mmap_unlock_work.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021 Facebook + */ + +#ifndef __MMAP_UNLOCK_WORK_H__ +#define __MMAP_UNLOCK_WORK_H__ +#include <linux/irq_work.h> + +/* irq_work to run mmap_read_unlock() in irq_work */ +struct mmap_unlock_irq_work { + struct irq_work irq_work; + struct mm_struct *mm; +}; + +DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); + +/* + * We cannot do mmap_read_unlock() when the irq is disabled, because of + * risk to deadlock with rq_lock. To look up vma when the irqs are + * disabled, we need to run mmap_read_unlock() in irq_work. We use a + * percpu variable to do the irq_work. If the irq_work is already used + * by another lookup, we fall over. + */ +static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr) +{ + struct mmap_unlock_irq_work *work = NULL; + bool irq_work_busy = false; + + if (irqs_disabled()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + work = this_cpu_ptr(&mmap_unlock_work); + if (irq_work_is_busy(&work->irq_work)) { + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } + } else { + /* + * PREEMPT_RT does not allow to trylock mmap sem in + * interrupt disabled context. Force the fallback code. + */ + irq_work_busy = true; + } + } + + *work_ptr = work; + return irq_work_busy; +} + +static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm) +{ + if (!work) { + mmap_read_unlock(mm); + } else { + work->mm = mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); + irq_work_queue(&work->irq_work); + } +} + +#endif /* __MMAP_UNLOCK_WORK_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 542f275bf252..868cc2c43899 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> +#include <linux/bpf-netns.h> #include <linux/filter.h> #include <net/net_namespace.h> diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 93a55391791a..556a769b5b80 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - u64 array_size; if (!bpf_capable()) return ERR_PTR(-EPERM); - array_size = sizeof(*array); - array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 9e0c10c6892a..638d7fd7b375 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6e75bbee39f0..22c8ae94e4c1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,10 @@ #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> -#include <linux/irq_work.h> #include <linux/btf_ids.h> #include <linux/buildid.h> #include "percpu_freelist.h" +#include "mmap_unlock_work.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ @@ -31,25 +31,6 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; -/* irq_work to run up_read() for build_id lookup in nmi context */ -struct stack_map_irq_work { - struct irq_work irq_work; - struct mm_struct *mm; -}; - -static void do_up_read(struct irq_work *entry) -{ - struct stack_map_irq_work *work; - - if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) - return; - - work = container_of(entry, struct stack_map_irq_work, irq_work); - mmap_read_unlock_non_owner(work->mm); -} - -static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); - static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; + struct mmap_unlock_irq_work *work = NULL; + bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); struct vm_area_struct *vma; - bool irq_work_busy = false; - struct stack_map_irq_work *work = NULL; - - if (irqs_disabled()) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - work = this_cpu_ptr(&up_read_work); - if (irq_work_is_busy(&work->irq_work)) { - /* cannot queue more up_read, fallback */ - irq_work_busy = true; - } - } else { - /* - * PREEMPT_RT does not allow to trylock mmap sem in - * interrupt disabled context. Force the fallback code. - */ - irq_work_busy = true; - } - } - /* - * We cannot do up_read() when the irq is disabled, because of - * risk to deadlock with rq_lock. To do build_id lookup when the - * irqs are disabled, we need to run up_read() in irq_work. We use - * a percpu variable to do the irq_work. If the irq_work is - * already used by another lookup, we fall back to report ips. - * - * Same fallback is used for kernel stack (!user) on a stackmap - * with build_id. + /* If the irq_work is in use, fall back to report ips. Same + * fallback is used for kernel stack (!user) on a stackmap with + * build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { @@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - - if (!work) { - mmap_read_unlock(current->mm); - } else { - work->mm = current->mm; - - /* The lock will be released once we're out of interrupt - * context. Tell lockdep that we've released it now so - * it doesn't complain that we forgot to release it. - */ - rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); - irq_work_queue(&work->irq_work); - } + bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * @@ -525,13 +472,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; - long res; + long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (regs) + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; @@ -542,7 +490,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &btf_task_struct_ids[0], + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, @@ -719,16 +667,3 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_btf_name = "bpf_stack_map", .map_btf_id = &stack_trace_map_btf_id, }; - -static int __init stack_map_init(void) -{ - int cpu; - struct stack_map_irq_work *work; - - for_each_possible_cpu(cpu) { - work = per_cpu_ptr(&up_read_work, cpu); - init_irq_work(&work->irq_work, do_up_read); - } - return 0; -} -subsys_initcall(stack_map_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1033ee8c0caf..fa4505f9b611 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> +#include <linux/bpf-cgroup.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> @@ -2198,7 +2199,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array +#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) { @@ -4772,7 +4773,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4819,7 +4820,7 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index b48750bfba5a..d94696198ef8 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -8,6 +8,7 @@ #include <linux/fdtable.h> #include <linux/filter.h> #include <linux/btf_ids.h> +#include "mmap_unlock_work.h" struct bpf_iter_seq_task_common { struct pid_namespace *ns; @@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = { .show = task_vma_seq_show, }; -BTF_ID_LIST(btf_task_file_ids) -BTF_ID(struct, file) -BTF_ID(struct, vm_area_struct) - static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, @@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = { .seq_info = &task_vma_seq_info, }; +BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, + bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags) +{ + struct mmap_unlock_irq_work *work = NULL; + struct vm_area_struct *vma; + bool irq_work_busy = false; + struct mm_struct *mm; + int ret = -ENOENT; + + if (flags) + return -EINVAL; + + if (!task) + return -ENOENT; + + mm = task->mm; + if (!mm) + return -ENOENT; + + irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + + if (irq_work_busy || !mmap_read_trylock(mm)) + return -EBUSY; + + vma = find_vma(mm, start); + + if (vma && vma->vm_start <= start && vma->vm_end > start) { + callback_fn((u64)(long)task, (u64)(long)vma, + (u64)(long)callback_ctx, 0, 0); + ret = 0; + } + bpf_mmap_unlock_mm(work, mm); + return ret; +} + +const struct bpf_func_proto bpf_find_vma_proto = { + .func = bpf_find_vma, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_FUNC, + .arg4_type = ARG_PTR_TO_STACK_OR_NULL, + .arg5_type = ARG_ANYTHING, +}; + +DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); + +static void do_mmap_read_unlock(struct irq_work *entry) +{ + struct mmap_unlock_irq_work *work; + + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return; + + work = container_of(entry, struct mmap_unlock_irq_work, irq_work); + mmap_read_unlock_non_owner(work->mm); +} + static int __init task_iter_init(void) { - int ret; + struct mmap_unlock_irq_work *work; + int ret, cpu; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&mmap_unlock_work, cpu); + init_irq_work(&work->irq_work, do_mmap_read_unlock); + } - task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; - task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; - task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e98de5e73ba5..4b6974a195c1 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -27,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + enum bpf_attach_type eatype = prog->expected_attach_type; + + return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN; +} + void *bpf_jit_alloc_exec_page(void) { void *image; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 50efda51515b..a39eedecc93a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4,6 +4,7 @@ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io */ #include <uapi/linux/btf.h> +#include <linux/bpf-cgroup.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> @@ -293,13 +294,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (log->level == BPF_LOG_KERNEL) { - pr_err("BPF:%s\n", log->kbuf); + bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + + pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) log->len_used += n; else @@ -439,18 +442,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -459,12 +450,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM; +} + +static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -472,14 +465,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } -static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || - type == ARG_PTR_TO_STACK_OR_NULL; + return type & PTR_MAYBE_NULL; } /* Determine whether the function releases some resources allocated by another @@ -539,39 +527,56 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } -/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", - [PTR_TO_FUNC] = "func", - [PTR_TO_MAP_KEY] = "map_key", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}, prefix[16] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_BUF] = "buf", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID || + base_type(type) == PTR_TO_PERCPU_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); + return env->type_str_buf; +} static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -606,6 +611,44 @@ static const char *kernel_type_name(const struct btf* btf, u32 id) return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } +static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static void mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state. */ +static void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -621,7 +664,8 @@ static void scrub_spilled_slot(u8 *stype) } static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) + const struct bpf_func_state *state, + bool print_all) { const struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -634,9 +678,11 @@ static void print_verifier_state(struct bpf_verifier_env *env, t = reg->type; if (t == NOT_INIT) continue; + if (!print_all && !reg_scratched(env, i)) + continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && @@ -644,9 +690,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID || + base_type(t) == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -655,10 +700,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - t == PTR_TO_MAP_KEY || - t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); @@ -723,12 +767,14 @@ static void print_verifier_state(struct bpf_verifier_env *env, types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; + if (!print_all && !stack_slot_scratched(env, i)) + continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -748,6 +794,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->in_async_callback_fn) verbose(env, " async_cb"); verbose(env, "\n"); + mark_verifier_state_clean(env); +} + +static inline u32 vlog_alignment(u32 pos) +{ + return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), + BPF_LOG_MIN_ALIGNMENT) - pos - 1; +} + +static void print_insn_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state) +{ + if (env->prev_log_len && env->prev_log_len == env->log.len_used) { + /* remove new line character */ + bpf_vlog_reset(&env->log, env->prev_log_len - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); + } else { + verbose(env, "%d:", env->insn_idx); + } + print_verifier_state(env, state, false); } /* copy array src of length n * size bytes to dst. dst is reallocated if it's too @@ -1141,8 +1207,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { - switch (reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: { + if (base_type(reg->type) == PTR_TO_MAP_VALUE) { const struct bpf_map *map = reg->map_ptr; if (map->inner_map_meta) { @@ -1161,32 +1226,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) } else { reg->type = PTR_TO_MAP_VALUE; } - break; - } - case PTR_TO_SOCKET_OR_NULL: - reg->type = PTR_TO_SOCKET; - break; - case PTR_TO_SOCK_COMMON_OR_NULL: - reg->type = PTR_TO_SOCK_COMMON; - break; - case PTR_TO_TCP_SOCK_OR_NULL: - reg->type = PTR_TO_TCP_SOCK; - break; - case PTR_TO_BTF_ID_OR_NULL: - reg->type = PTR_TO_BTF_ID; - break; - case PTR_TO_MEM_OR_NULL: - reg->type = PTR_TO_MEM; - break; - case PTR_TO_RDONLY_BUF_OR_NULL: - reg->type = PTR_TO_RDONLY_BUF; - break; - case PTR_TO_RDWR_BUF_OR_NULL: - reg->type = PTR_TO_RDWR_BUF; - break; - default: - WARN_ONCE(1, "unknown nullable register type"); + return; } + + reg->type &= ~PTR_MAYBE_NULL; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -1366,22 +1409,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static bool __reg32_bound_s64(s32 a) +{ + return a >= 0 && a <= S32_MAX; +} + static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { reg->umin_value = reg->u32_min_value; reg->umax_value = reg->u32_max_value; - /* Attempt to pull 32-bit signed bounds into 64-bit bounds - * but must be positive otherwise set to worse case bounds - * and refine later from tnum. + + /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must + * be positive otherwise set to worse case bounds and refine later + * from tnum. */ - if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) - reg->smax_value = reg->s32_max_value; - else - reg->smax_value = U32_MAX; - if (reg->s32_min_value >= 0) + if (__reg32_bound_s64(reg->s32_min_value) && + __reg32_bound_s64(reg->s32_max_value)) { reg->smin_value = reg->s32_min_value; - else + reg->smax_value = reg->s32_max_value; + } else { reg->smin_value = 0; + reg->smax_value = U32_MAX; + } } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) @@ -1538,6 +1587,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->frameno = frameno; state->subprogno = subprogno; init_reg_state(env, state); + mark_verifier_state_scratched(env); } /* Similar to push_stack(), but for async callbacks */ @@ -2041,7 +2091,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2225,6 +2275,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EINVAL; } + mark_reg_scratched(env, regno); + reg = ®s[regno]; rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { @@ -2329,7 +2381,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (insn->code == 0) return 0; - if (env->log.level & BPF_LOG_LEVEL) { + if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); verbose(env, "%d: ", idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); @@ -2379,8 +2431,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, */ if (insn->src_reg != BPF_REG_FP) return 0; - if (BPF_SIZE(insn->code) != BPF_DW) - return 0; /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be @@ -2403,8 +2453,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, /* scalars can only be spilled into stack */ if (insn->dst_reg != BPF_REG_FP) return 0; - if (BPF_SIZE(insn->code) != BPF_DW) - return 0; spi = (-insn->off - 1) / BPF_REG_SIZE; if (spi >= 64) { verbose(env, "BUG spi %d\n", spi); @@ -2587,7 +2635,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; - if (env->log.level & BPF_LOG_LEVEL) + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); for (i = last_idx;;) { if (skip_first) { @@ -2674,11 +2722,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, new_marks = true; reg->precise = true; } - if (env->log.level & BPF_LOG_LEVEL) { - print_verifier_state(env, func); - verbose(env, "parent %s regs=%x stack=%llx marks\n", + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "parent %s regs=%x stack=%llx marks:", new_marks ? "didn't have" : "already had", reg_mask, stack_mask); + print_verifier_state(env, func, true); } if (!reg_mask && !stack_mask) @@ -2704,9 +2752,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2715,21 +2762,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: - case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_BUF: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: return true; @@ -2834,6 +2873,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, env->insn_aux_data[insn_idx].sanitize_stack_spill = true; } + mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { @@ -2955,6 +2995,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, slot = -i - 1; spi = slot / BPF_REG_SIZE; stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + mark_stack_slot_scratched(env, spi); if (!env->allow_ptr_leaks && *stype != NOT_INIT @@ -3371,11 +3412,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. - */ - if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, state); - - /* The minimum value is only important with signed + * + * The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use @@ -3570,7 +3608,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + if (base_type(*reg_type) == PTR_TO_BTF_ID) { *btf = info.btf; *btf_id = info.btf_id; } else { @@ -3638,7 +3676,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, } verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -3933,16 +3971,17 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. */ - if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3950,13 +3989,20 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4365,15 +4411,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -4386,7 +4447,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; @@ -4403,7 +4464,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -4411,8 +4472,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) { + if (base_type(reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; } @@ -4465,7 +4525,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -4481,26 +4541,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); - return -EACCES; + } else if (base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + const char *buf_info; + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + buf_info, max_access); + + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EACCES; } @@ -4551,9 +4617,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (insn->imm == BPF_CMPXCHG) { /* Check comparison of R0 with memory location */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); + const u32 aux_reg = BPF_REG_0; + + err = check_reg_arg(env, aux_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, aux_reg)) { + verbose(env, "R%d leaks addr into mem\n", aux_reg); + return -EACCES; + } } if (is_pointer_value(env, insn->src_reg)) { @@ -4567,7 +4640,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -4588,13 +4661,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i load_reg = -1; } - /* check whether we can read the memory */ + /* Check whether we can read the memory, with second call for fetch + * case to simulate the register fill. + */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, true); + BPF_SIZE(insn->code), BPF_READ, -1, true); + if (!err && load_reg >= 0) + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, load_reg, + true); if (err) return err; - /* check whether we can write into the same memory */ + /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true); if (err) @@ -4744,8 +4823,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const char *buf_info; + u32 *max_access; - switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4764,18 +4845,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + buf_info, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4787,9 +4870,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } @@ -4800,7 +4883,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, if (register_is_null(reg)) return 0; - if (reg_type_may_be_null(reg->type)) { + if (type_may_be_null(reg->type)) { /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -4948,9 +5031,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -5055,8 +5137,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_MEM | MEM_ALLOC, + PTR_TO_BUF, }, }; @@ -5073,7 +5155,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5087,31 +5169,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, - [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, }; @@ -5125,12 +5202,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j; - compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; } + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. + */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -5140,14 +5232,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -5163,12 +5255,6 @@ found: kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } return 0; @@ -5206,15 +5292,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } - if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -5224,10 +5309,33 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; + break; + /* All the rest must be rejected: */ + default: +force_off_check: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; } skip_type_check: @@ -5283,10 +5391,11 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ @@ -5950,6 +6059,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { struct bpf_verifier_state *async_cb; @@ -6008,9 +6118,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); - print_verifier_state(env, caller); + print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee); + print_verifier_state(env, callee, true); } return 0; } @@ -6101,6 +6211,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_loop_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, + * u64 flags); + * callback_fn(u32 index, void *callback_ctx); + */ + callee->regs[BPF_REG_1].type = SCALAR_VALUE; + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int set_timer_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -6130,6 +6261,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_find_vma_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_find_vma(struct task_struct *task, u64 addr, + * void *callback_fn, void *callback_ctx, u64 flags) + * (callback_fn)(struct task_struct *task, + * struct vm_area_struct *vma, void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].btf = btf_vmlinux; + callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA], + + /* pointer to stack or null */ + callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6177,9 +6335,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); - print_verifier_state(env, callee); + print_verifier_state(env, callee, true); verbose(env, "to caller at %d:\n", *insn_idx); - print_verifier_state(env, caller); + print_verifier_state(env, caller, true); } /* clear everything in the callee */ free_func_state(callee); @@ -6345,13 +6503,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, static int check_get_func_ip(struct bpf_verifier_env *env) { - enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog); int func_id = BPF_FUNC_get_func_ip; if (type == BPF_PROG_TYPE_TRACING) { - if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && - eatype != BPF_MODIFY_RETURN) { + if (!bpf_prog_has_trampoline(env->prog)) { verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", func_id_name(func_id), func_id); return -ENOTSUPP; @@ -6370,6 +6526,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; int insn_idx = *insn_idx_p; @@ -6447,13 +6605,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (func_id == BPF_FUNC_tail_call) { - err = check_reference_leak(env); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); - return err; - } - } else if (is_release_function(func_id)) { + if (is_release_function(func_id)) { err = release_reference(env, meta.ref_obj_id); if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", @@ -6464,35 +6616,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); - /* check that flags argument in get_local_storage(map, flags) is 0, - * this is required because get_local_storage() can't return an error. - */ - if (func_id == BPF_FUNC_get_local_storage && - !register_is_null(®s[BPF_REG_2])) { - verbose(env, "get_local_storage() doesn't support non-zero flags\n"); - return -EINVAL; - } - - if (func_id == BPF_FUNC_for_each_map_elem) { + switch (func_id) { + case BPF_FUNC_tail_call: + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + break; + case BPF_FUNC_get_local_storage: + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (!register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + break; + case BPF_FUNC_for_each_map_elem: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_map_elem_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_timer_set_callback) { + break; + case BPF_FUNC_timer_set_callback: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_timer_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_snprintf) { + break; + case BPF_FUNC_find_vma: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_find_vma_callback_state); + break; + case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); - if (err < 0) - return err; + break; + case BPF_FUNC_loop: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_loop_callback_state); + break; } + if (err) + return err; + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -6503,13 +6667,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -6523,28 +6688,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].map_uid = meta.map_uid; - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -6562,29 +6724,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? - PTR_TO_BTF_ID : - PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } /* current BPF helper definitions are only coming from @@ -6593,12 +6756,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf = btf_vmlinux; regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { @@ -6807,25 +6970,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; } @@ -7202,11 +7365,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch (base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -7214,14 +7379,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: break; @@ -8194,12 +8356,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, state); + print_verifier_state(env, state, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, state); + print_verifier_state(env, state, true); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -8308,6 +8470,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->dst_reg); } zext_32_to_64(dst_reg); + + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); } } else { /* case: R = imm @@ -8422,7 +8588,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range = dst_reg->off; if (range_right_open) - new_range--; + new_range++; /* Examples for register markings: * @@ -8940,17 +9106,17 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. - */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || reg->off)) { - __mark_reg_known_zero(reg); - reg->off = 0; + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. If we + * see this happening, don't convert the register. + */ + return; } if (is_null) { reg->type = SCALAR_VALUE; @@ -9318,7 +9484,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ @@ -9334,7 +9500,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, this_branch->frame[this_branch->curframe]); + print_insn_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -9369,11 +9535,15 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. + */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; @@ -9409,7 +9579,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || @@ -9513,7 +9682,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; @@ -9572,7 +9741,7 @@ static int check_return_code(struct bpf_verifier_env *env) /* enforce return zero from async callbacks like timer */ if (reg->type != SCALAR_VALUE) { verbose(env, "In async callback the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -9586,7 +9755,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -9650,7 +9819,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -10233,6 +10402,78 @@ err_free: return err; } +#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) +#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_core_relo(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 i, nr_core_relo, ncopy, expected_size, rec_size; + struct bpf_core_relo core_relo = {}; + struct bpf_prog *prog = env->prog; + const struct btf *btf = prog->aux->btf; + struct bpf_core_ctx ctx = { + .log = &env->log, + .btf = btf, + }; + bpfptr_t u_core_relo; + int err; + + nr_core_relo = attr->core_relo_cnt; + if (!nr_core_relo) + return 0; + if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) + return -EINVAL; + + rec_size = attr->core_relo_rec_size; + if (rec_size < MIN_CORE_RELO_SIZE || + rec_size > MAX_CORE_RELO_SIZE || + rec_size % sizeof(u32)) + return -EINVAL; + + u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); + expected_size = sizeof(struct bpf_core_relo); + ncopy = min_t(u32, expected_size, rec_size); + + /* Unlike func_info and line_info, copy and apply each CO-RE + * relocation record one at a time. + */ + for (i = 0; i < nr_core_relo; i++) { + /* future proofing when sizeof(bpf_core_relo) changes */ + err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in core_relo"); + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, core_relo_rec_size), + &expected_size, sizeof(expected_size))) + err = -EFAULT; + } + break; + } + + if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { + err = -EFAULT; + break; + } + + if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { + verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", + i, core_relo.insn_off, prog->len); + err = -EINVAL; + break; + } + + err = bpf_core_apply(&ctx, &core_relo, i, + &prog->insnsi[core_relo.insn_off / 8]); + if (err) + break; + bpfptr_add(&u_core_relo, rec_size); + } + return err; +} + static int check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) @@ -10263,6 +10504,10 @@ static int check_btf_info(struct bpf_verifier_env *env, if (err) return err; + err = check_core_relo(env, attr, uattr); + if (err) + return err; + return 0; } @@ -10431,7 +10676,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -10453,6 +10698,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -10464,20 +10725,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. - */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -10506,11 +10753,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -11036,17 +11280,13 @@ next: /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -11126,16 +11366,12 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level & BPF_LOG_LEVEL2 || - (env->log.level & BPF_LOG_LEVEL && do_print_state)) { - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "%d:", env->insn_idx); - else - verbose(env, "\nfrom %d to %d%s:", - env->prev_insn_idx, env->insn_idx, - env->cur_state->speculative ? - " (speculative execution)" : ""); - print_verifier_state(env, state->frame[state->curframe]); + if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) { + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + print_verifier_state(env, state->frame[state->curframe], true); do_print_state = false; } @@ -11146,9 +11382,15 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; + if (verifier_state_scratched(env)) + print_insn_state(env, state->frame[state->curframe]); + verbose_linfo(env, env->insn_idx, "; "); + env->prev_log_len = env->log.len_used; verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + env->prev_insn_print_len = env->log.len_used - env->prev_log_len; + env->prev_log_len = env->log.len_used; } if (bpf_prog_is_dev_bound(env->prog->aux)) { @@ -11270,7 +11512,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -11357,6 +11599,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: + mark_verifier_state_scratched(env); update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, &env->insn_idx, pop_log); @@ -11522,7 +11765,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, err = -EINVAL; goto err_put; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; @@ -11682,6 +11925,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: break; default: verbose(env, @@ -12865,6 +13111,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + enum bpf_attach_type eatype = prog->expected_attach_type; bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; @@ -13235,11 +13482,79 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_arg inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[7] = BPF_JMP_A(1); + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + cnt = 9; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + /* Implement bpf_get_func_ret inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ret) { + if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 6; + } else { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); + cnt = 1; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + /* Implement get_func_arg_cnt inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg_cnt) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_get_func_ip inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ip) { - /* Load IP address from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + /* Load IP address from ctx - 16 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); if (!new_prog) @@ -13353,7 +13668,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); - else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + else if (base_type(regs[i].type) == PTR_TO_MEM) { const u32 mem_size = regs[i].mem_size; mark_reg_known_zero(env, regs, i); @@ -13941,13 +14256,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || - !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) + if (!bpf_verifier_log_attr_valid(log)) { + ret = -EINVAL; goto err_unlock; + } } + mark_verifier_state_clean(env); + if (IS_ERR(btf_vmlinux)) { /* Either gcc or pahole or kernel are broken. */ verbose(env, "in-kernel BTF is malformed\n"); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..6e36e854b512 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) return container_of(kfc, struct cgroup_fs_context, kfc); } +struct cgroup_pidlist; + +struct cgroup_file_ctx { + struct cgroup_namespace *ns; + + struct { + void *trigger; + } psi; + + struct { + bool started; + struct css_task_iter iter; + } procs; + + struct { + struct cgroup_pidlist *pidlist; + } procs1; +}; + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 81c9e0685948..41e0837a5a0b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -394,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -403,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * !NULL @ctx->procs1.pidlist indicates that this isn't the first + * start() after open. If the matching pidlist is around, we can use + * that. Look for it. Note that @ctx->procs1.pidlist can't be used + * directly. It could already have been destroyed. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); + if (ctx->procs1.pidlist) + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); + if (!ctx->procs1.pidlist) { + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } - l = of->priv; + l = ctx->procs1.pidlist; if (pid) { int end = l->length; @@ -449,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, @@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -504,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, goto out_unlock; /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. + * Even if we're attaching all tasks in the thread group, we only need + * to check permissions on one of them. Check permissions using the + * credentials from file open to protect against inherited fd attacks. */ - cred = current_cred(); + cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..9d05c3ca2d5e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -30,6 +30,7 @@ #include "cgroup-internal.h" +#include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> @@ -2650,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_preload_node)) return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); @@ -3630,6 +3631,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; @@ -3641,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3648,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3679,12 +3686,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) @@ -3811,24 +3822,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) { + put_cgroup_ns(ctx->ns); + kfree(ctx); + } + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; @@ -3845,7 +3875,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4751,21 +4781,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4773,21 +4803,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); @@ -4838,9 +4865,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; int ret; @@ -4869,11 +4896,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4890,8 +4918,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; bool locked; @@ -4909,9 +4939,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* process and thread migrations follow same delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, threadgroup); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); + revert_creds(saved_cred); if (ret) goto out_finish; @@ -5711,7 +5748,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); @@ -6130,7 +6167,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) goto err; ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, - !(kargs->flags & CLONE_THREAD)); + !(kargs->flags & CLONE_THREAD), + current->nsproxy->cgroup_ns); if (ret) goto err; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d0e163a02099..dc653ab26e50 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -616,19 +616,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) struct cpuset *c, *par; int ret; - rcu_read_lock(); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* Remaining checks don't apply to root cpuset */ - ret = 0; + /* The checks don't apply to root cpuset */ if (cur == &top_cpuset) - goto out; + return 0; + rcu_read_lock(); par = parent_cs(cur); /* On legacy hierarchy, we must be a subset of our parent cpuset. */ @@ -3536,7 +3528,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ + bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..9d331ba44870 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ - if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; raw_spin_lock_irqsave(cpu_lock, flags); @@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; if (pos == root) return NULL; @@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ - if (!pos) + if (!pos) { pos = root; - else + /* return NULL if this subtree is not on-list */ + if (!cgroup_rstat_cpu(pos, cpu)->updated_next) + return NULL; + } else { pos = cgroup_parent(pos); + } /* walk down to the first leaf */ while (true) { @@ -115,33 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - if (rstatc->updated_next) { - struct cgroup *parent = cgroup_parent(pos); - - if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; - - prstatc = cgroup_rstat_cpu(parent, cpu); - nextp = &prstatc->updated_children; - while (true) { - struct cgroup_rstat_cpu *nrstatc; - - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; - } - *nextp = rstatc->updated_next; - } + parent = cgroup_parent(pos); + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; - rstatc->updated_next = NULL; - return pos; + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (*nextp != pos) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - /* only happens for @root */ - return NULL; + rstatc->updated_next = NULL; + return pos; } /* see cgroup_rstat_flush() */ diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config new file mode 100644 index 000000000000..e9ffb0cc1eec --- /dev/null +++ b/kernel/configs/debug.config @@ -0,0 +1,105 @@ +# The config is based on running daily CI for enterprise Linux distros to +# seek regressions on linux-next builds on different bare-metal and virtual +# platforms. It can be used for example, +# +# $ make ARCH=arm64 defconfig debug.config +# +# Keep alphabetically sorted inside each section. +# +# printk and dmesg options +# +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_PRINTK_CALLER=y +CONFIG_PRINTK_TIME=y +CONFIG_SYMBOLIC_ERRNAME=y +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_FRAME_WARN=2048 +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# +# Generic Kernel Debugging Instruments +# +# CONFIG_UBSAN_ALIGNMENT is not set +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_TRAP is not set +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_UBSAN=y +CONFIG_UBSAN_BOOL=y +CONFIG_UBSAN_BOUNDS=y +CONFIG_UBSAN_ENUM=y +CONFIG_UBSAN_SHIFT=y +CONFIG_UBSAN_UNREACHABLE=y +# +# Memory Debugging +# +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_WX is not set +# CONFIG_KFENCE is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_SLUB_STATS is not set +CONFIG_PAGE_EXTENSION=y +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_DEBUG_OBJECTS_FREE=y +CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=y +CONFIG_DEBUG_OBJECTS_TIMERS=y +CONFIG_DEBUG_OBJECTS_WORK=y +CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_VIRTUAL=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_KASAN=y +CONFIG_KASAN_GENERIC=y +CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_SLUB_DEBUG_ON=y +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SOFTLOCKUP_DETECTOR=y +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +# CONFIG_PROVE_RAW_LOCK_NESTING is not set +CONFIG_PROVE_LOCKING=y +# +# Debug kernel data structures +# +CONFIG_BUG_ON_DATA_CORRUPTION=y +# +# RCU Debugging +# +CONFIG_PROVE_RCU=y +CONFIG_PROVE_RCU_LIST=y +# +# Tracers +# +CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 192e43a87407..407a2568f35e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -31,6 +31,7 @@ #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> +#include <linux/scs.h> #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> @@ -588,6 +589,12 @@ static int bringup_cpu(unsigned int cpu) int ret; /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + + /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * Prevent irq alloc/free across the bringup. diff --git a/kernel/crash_core.c b/kernel/crash_core.c index eb53f5ec62c9..256cf6db573c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -6,6 +6,7 @@ #include <linux/buildid.h> #include <linux/crash_core.h> +#include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> @@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 51530d5b15a8..c5e8cea9e05f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -100,19 +100,10 @@ void __delayacct_blkio_start(void) */ void __delayacct_blkio_end(struct task_struct *p) { - struct task_delay_info *delays = p->delays; - u64 *total; - u32 *count; - - if (p->delays->flags & DELAYACCT_PF_SWAPIN) { - total = &delays->swapin_delay; - count = &delays->swapin_count; - } else { - total = &delays->blkio_delay; - count = &delays->blkio_count; - } - - delayacct_end(&delays->lock, &delays->blkio_start, total, count); + delayacct_end(&p->delays->lock, + &p->delays->blkio_start, + &p->delays->blkio_delay, + &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -164,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + tmp = d->compact_delay_total + tsk->delays->compact_delay; + d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; + d->compact_count += tsk->delays->compact_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -179,8 +173,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); + ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } @@ -210,3 +203,29 @@ void __delayacct_thrashing_end(void) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count); } + +void __delayacct_swapin_start(void) +{ + current->delays->swapin_start = local_clock(); +} + +void __delayacct_swapin_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->swapin_start, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); +} + +void __delayacct_compact_start(void) +{ + current->delays->compact_start = local_clock(); +} + +void __delayacct_compact_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->compact_start, + ¤t->delays->compact_delay, + ¤t->delays->compact_count); +} diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4c6c5e0635e3..50f48e9e4598 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) +{ + if (!force_dma_unencrypted(dev)) + return 0; + return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size)); +} + +static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) +{ + int ret; + + if (!force_dma_unencrypted(dev)) + return 0; + ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + if (ret) + pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); + return ret; +} + static void __dma_direct_free_pages(struct device *dev, struct page *page, size_t size) { - if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && - swiotlb_free(dev, page, size)) + if (swiotlb_free(dev, page, size)) return; dma_free_contiguous(dev, page, size); } +static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) +{ + struct page *page = swiotlb_alloc(dev, size); + + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + swiotlb_free(dev, page, size); + return NULL; + } + + return page; +} + static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_swiotlb(dev, size); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && - is_swiotlb_for_alloc(dev)) { - page = swiotlb_alloc(dev, size); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __dma_direct_free_pages(dev, page, size); - return NULL; - } - return page; - } - page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); @@ -133,6 +156,15 @@ again: return page; } +/* + * Check if a potentially blocking operations needs to dip into the atomic + * pools for the given device/gfp. + */ +static bool dma_direct_use_pool(struct device *dev, gfp_t gfp) +{ + return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev); +} + static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { @@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, u64 phys_mask; void *ret; + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))) + return NULL; + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); @@ -149,64 +184,103 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, return ret; } +static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + + /* remove any dirty cache lines on the kernel alias */ + if (!PageHighMem(page)) + arch_dma_prep_coherent(page, size); + + /* return the page pointer as the opaque cookie */ + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +} + void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + bool remap = false, set_uncached = false; struct page *page; void *ret; - int err; size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); - if (!page) - return NULL; - /* remove any dirty cache lines on the kernel alias */ - if (!PageHighMem(page)) - arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); - /* return the page pointer as the opaque cookie */ - return page; - } + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && - !dev_is_dma_coherent(dev) && - !is_swiotlb_for_alloc(dev)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + if (!dev_is_dma_coherent(dev)) { + /* + * Fallback to the arch handler if it exists. This should + * eventually go away. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !is_swiotlb_for_alloc(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, + attrs); - if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && - !dev_is_dma_coherent(dev)) - return dma_alloc_from_global_coherent(dev, size, dma_handle); + /* + * If there is a global pool, always allocate from it for + * non-coherent devices. + */ + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL)) + return dma_alloc_from_global_coherent(dev, size, + dma_handle); + + /* + * Otherwise remap if the architecture is asking for it. But + * given that remapping memory is a blocking operation we'll + * instead have to dip into the atomic pools. + */ + remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); + if (remap) { + if (dma_direct_use_pool(dev, gfp)) + return dma_direct_alloc_from_pool(dev, size, + dma_handle, gfp); + } else { + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) + return NULL; + set_uncached = true; + } + } /* - * Remapping or decrypting memory may block. If either is required and - * we can't block, allocate the memory from the atomic pools. - * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must - * set up another device coherent pool by shared-dma-pool and use - * dma_alloc_from_dev_coherent instead. + * Decrypting memory may block, so allocate the memory from the atomic + * pools if we can't block. */ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - !gfpflags_allow_blocking(gfp) && - (force_dma_unencrypted(dev) || - (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev))) && - !is_swiotlb_for_alloc(dev)) + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup, + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, so + * log an error and fail. + */ + if (!IS_ENABLED(CONFIG_DMA_REMAP)) { + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + remap = true; + set_uncached = false; + } - if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) || - (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + if (remap) { /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -216,56 +290,27 @@ void *dma_direct_alloc(struct device *dev, size_t size, __builtin_return_address(0)); if (!ret) goto out_free_pages; - if (force_dma_unencrypted(dev)) { - err = set_memory_decrypted((unsigned long)ret, - 1 << get_order(size)); - if (err) - goto out_free_pages; - } - memset(ret, 0, size); - goto done; - } - - if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, - * so log an error and fail. - */ - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; - } - - ret = page_address(page); - if (force_dma_unencrypted(dev)) { - err = set_memory_decrypted((unsigned long)ret, - 1 << get_order(size)); - if (err) + } else { + ret = page_address(page); + if (dma_set_decrypted(dev, ret, size)) goto out_free_pages; } memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !dev_is_dma_coherent(dev)) { + if (set_uncached) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } -done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: - if (force_dma_unencrypted(dev)) { - err = set_memory_encrypted((unsigned long)page_address(page), - 1 << get_order(size)); - /* If memory cannot be re-encrypted, it must be leaked */ - if (err) - return NULL; - } + if (dma_set_encrypted(dev, page_address(page), size)) + return NULL; out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; @@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size, dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - - if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { vunmap(cpu_addr); - else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) - arch_dma_clear_uncached(cpu_addr, size); + } else { + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); + if (dma_set_encrypted(dev, cpu_addr, 1 << page_order)) + return; + } __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } @@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && - !is_swiotlb_for_alloc(dev)) + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { - if (set_memory_decrypted((unsigned long)ret, - 1 << get_order(size))) - goto out_free_pages; - } + if (dma_set_decrypted(dev, ret, size)) + goto out_free_pages; memset(ret, 0, size); *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; @@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)vaddr, 1 << page_order); - + if (dma_set_encrypted(dev, vaddr, 1 << page_order)) + return; __dma_direct_free_pages(dev, page, size); } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5f84e6cdb78e..4d40dcce7604 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) GFP_KERNEL); if (!atomic_pool_kernel) ret = -ENOMEM; - if (IS_ENABLED(CONFIG_ZONE_DMA)) { + if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) @@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) if (prev == NULL) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) return atomic_pool_dma32; - if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; return atomic_pool_kernel; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8e840fbbed7c..f1e7ea160b43 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,6 +50,7 @@ #include <asm/io.h> #include <asm/dma.h> +#include <linux/io.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/iommu-helper.h> @@ -72,6 +73,8 @@ enum swiotlb_force swiotlb_force; struct io_tlb_mem io_tlb_default_mem; +phys_addr_t swiotlb_unencrypted_base; + /* * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). @@ -156,6 +159,34 @@ static inline unsigned long nr_slots(u64 val) } /* + * Remap swioltb memory in the unencrypted physical address space + * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP + * Isolation VMs). + */ +#ifdef CONFIG_HAS_IOMEM +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + void *vaddr = NULL; + + if (swiotlb_unencrypted_base) { + phys_addr_t paddr = mem->start + swiotlb_unencrypted_base; + + vaddr = memremap(paddr, bytes, MEMREMAP_WB); + if (!vaddr) + pr_err("Failed to map the unencrypted memory %pa size %lx.\n", + &paddr, bytes); + } + + return vaddr; +} +#else +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + return NULL; +} +#endif + +/* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called @@ -172,7 +203,12 @@ void __init swiotlb_update_mem_attributes(void) vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); + + mem->vaddr = swiotlb_mem_remap(mem, bytes); + if (!mem->vaddr) + mem->vaddr = vaddr; + + memset(mem->vaddr, 0, bytes); } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, @@ -196,7 +232,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } + + /* + * If swiotlb_unencrypted_base is set, the bounce buffer memory will + * be remapped and cleared in swiotlb_update_mem_attributes. + */ + if (swiotlb_unencrypted_base) + return; + memset(vaddr, 0, bytes); + mem->vaddr = vaddr; + return; } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) @@ -371,7 +417,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d5a61d565ad5..bad713684c2e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -187,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, /* Check if any of the above work has queued a deferred wakeup */ tick_nohz_user_enter_prepare(); - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } /* Return the latest work state for arch_exit_to_user_mode() */ @@ -196,7 +196,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, static void exit_to_user_mode_prepare(struct pt_regs *regs) { - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + unsigned long ti_work = read_thread_flags(); lockdep_assert_irqs_disabled(); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 49972ee99aff..96d476e06c77 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -26,7 +26,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ret) return ret; - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); return 0; } @@ -43,7 +43,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) * disabled in the inner loop before going into guest mode. No need * to disable interrupts here. */ - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..76c754e45d01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { + struct perf_cgroup_info *info; struct perf_cgroup *cgrp; /* @@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { + info = this_cpu_ptr(event->cgrp->info); + __update_cgrp_time(info, perf_clock(), true); + } } static inline void @@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } @@ -982,14 +1019,6 @@ out: } static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -1808,6 +1875,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1999,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2346,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event, if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); @@ -2357,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event, list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { @@ -2388,7 +2462,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - if (!ctx->is_active) { + /* + * Cgroup events are per-cpu events, and must IPI because of + * cgrp_cpuctx_list. + */ + if (!ctx->is_active && !is_cgroup_event(event)) { __perf_remove_from_context(event, __get_cpu_context(ctx), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); @@ -2478,40 +2556,6 @@ void perf_event_disable_inatomic(struct perf_event *event) irq_work_queue(&event->pending); } -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; -} - #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -2552,8 +2596,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -2857,11 +2899,14 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. + * Similarly, cgroup events for the context also needs the IPI to + * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ - if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); @@ -3247,16 +3292,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3270,7 +3305,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ @@ -3707,13 +3757,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. + */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_update_userpage(event); return true; @@ -3797,13 +3853,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { int is_active = ctx->is_active; - u64 now; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (is_active ^ EVENT_TIME) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(task, ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) @@ -3814,13 +3880,6 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -4414,6 +4473,18 @@ static inline u64 perf_event_count(struct perf_event *event) return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4473,10 +4544,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now;; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -5798,18 +5868,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -5934,6 +5992,8 @@ static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -5991,6 +6051,9 @@ static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6004,6 +6067,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6349,7 +6415,6 @@ accounting: ring_buffer_attach(event, rb); perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -6521,26 +6586,43 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ -struct perf_guest_info_callbacks *perf_guest_cbs; +#ifdef CONFIG_GUEST_PERF_EVENTS +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return; + + rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. */ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return; + + rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif static void perf_output_sample_regs(struct perf_output_handle *handle, @@ -6696,7 +6778,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) goto out; @@ -6762,7 +6844,7 @@ static void perf_aux_sample_output(struct perf_event *event, if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) return; @@ -9759,6 +9841,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, continue; if (event->attr.config != entry->type) continue; + /* Cannot deliver synchronous signal to other task. */ + if (event->attr.sigtrap) + continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } diff --git a/kernel/exit.c b/kernel/exit.c index f702a6a63686..b00a25bb4ab9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -116,7 +116,7 @@ static void __exit_signal(struct task_struct *tsk) * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); @@ -697,7 +697,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { @@ -735,37 +735,22 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - /* - * We can get here from a kernel oops, sometimes with preemption off. - * Start by checking for critical errors. - * Then fix up important state like USER_DS and preemption. - * Then do everything else. - */ - WARN_ON(blk_needs_flush_plug(tsk)); - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - /* - * If do_exit is called because this processes oopsed, it's possible + * If do_dead is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before * continuing. Amongst other possible reasons, this is to prevent * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. + * + * On uptodate architectures force_uaccess_begin is a noop. On + * architectures that still have set_fs/get_fs in addition to handling + * oopses handles kernel threads that run as set_fs(KERNEL_DS) by + * default. */ force_uaccess_begin(); - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - - profile_task_exit(tsk); kcov_task_exit(tsk); coredump_task_exit(tsk); @@ -773,17 +758,6 @@ void __noreturn do_exit(long code) validate_creds_for_do_exit(tsk); - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - pr_alert("Fixing recursive fault but reboot is needed!\n"); - futex_exit_recursive(tsk); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ @@ -882,16 +856,46 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } -EXPORT_SYMBOL_GPL(do_exit); -void complete_and_exit(struct completion *comp, long code) +void __noreturn make_task_dead(int signr) { - if (comp) - complete(comp); + /* + * Take the task off the cpu after something catastrophic has + * happened. + * + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ + struct task_struct *tsk = current; - do_exit(code); + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); + do_task_dead(); + } + + do_exit(signr); } -EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { @@ -907,17 +911,19 @@ do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; @@ -1012,7 +1018,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - status = p->exit_code; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..d75a528f7b21 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> @@ -365,12 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); } @@ -754,9 +757,7 @@ void __put_task_struct(struct task_struct *tsk) delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); - - if (!profile_handoff_task(tsk)) - free_task(tsk); + free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); @@ -950,7 +951,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - tsk->pf_io_worker = NULL; + tsk->worker_private = NULL; account_kernel_stack(tsk, 1); @@ -1556,32 +1557,6 @@ out: return error; } -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -2032,12 +2007,6 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } - /* - * This _must_ happen before we call free_task(), i.e. before we jump - * to any of the bad_fork_* labels. This is to avoid freeing - * p->set_child_tid which is (ab)used as a kthread's data pointer for - * kernel threads (PF_KTHREAD). - */ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? @@ -2118,12 +2087,16 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); + if (p->flags & PF_KTHREAD) { + if (!set_kthread_struct(p)) + goto bad_fork_cleanup_delayacct; + } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_threadgroup_lock; + goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS @@ -2460,8 +2433,8 @@ bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_threadgroup_lock: #endif +bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 25d8a88b32e5..51dd822a8060 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -41,11 +41,6 @@ #include "futex.h" #include "../locking/rtmutex_common.h" -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -int __read_mostly futex_cmpxchg_enabled; -#endif - - /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they @@ -776,9 +771,6 @@ static void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; - if (!futex_cmpxchg_enabled) - return; - /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -874,9 +866,6 @@ static void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; - if (!futex_cmpxchg_enabled) - return; - /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -950,8 +939,6 @@ static void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; - if (!futex_cmpxchg_enabled) - return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1044,7 +1031,7 @@ static void futex_cleanup(struct task_struct *tsk) * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * - * This is called from the recursive fault handling path in do_exit(). + * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can @@ -1125,26 +1112,6 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } -static void __init futex_detect_cmpxchg(void) -{ -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG - u32 curval; - - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; -#endif -} - static int __init futex_init(void) { unsigned int futex_shift; @@ -1163,8 +1130,6 @@ static int __init futex_init(void) futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - futex_detect_cmpxchg(); - for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 040ae4277cb0..c264cbeab71c 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -27,12 +27,6 @@ #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 -#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -#define futex_cmpxchg_enabled 1 -#else -extern int __read_mostly futex_cmpxchg_enabled; -#endif - #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 6f91a07a6a83..086a22d1adb7 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -29,8 +29,6 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { - if (!futex_cmpxchg_enabled) - return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -56,9 +54,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, unsigned long ret; struct task_struct *p; - if (!futex_cmpxchg_enabled) - return -ENOSYS; - rcu_read_lock(); ret = -ESRCH; @@ -104,17 +99,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, } switch (cmd) { - case FUTEX_LOCK_PI: - case FUTEX_LOCK_PI2: - case FUTEX_UNLOCK_PI: - case FUTEX_TRYLOCK_PI: - case FUTEX_WAIT_REQUEUE_PI: - case FUTEX_CMP_REQUEUE_PI: - if (!futex_cmpxchg_enabled) - return -ENOSYS; - } - - switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; @@ -323,9 +307,6 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { - if (!futex_cmpxchg_enabled) - return -ENOSYS; - if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -342,9 +323,6 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, unsigned long ret; struct task_struct *p; - if (!futex_cmpxchg_enabled) - return -ENOSYS; - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 053447183ac5..04f4ebdc3cf5 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR select CONSTRUCTORS default n diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,7 +63,9 @@ static struct task_struct *watchdog_task; * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* @@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -310,6 +384,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f895265d7548..c09324663088 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq); */ void handle_untracked_irq(struct irq_desc *desc) { - unsigned int flags = 0; - raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) @@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc, &flags); + __handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 6f29bf4c8515..f0862eb6b506 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -451,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) } -struct irq_domain_ops irq_generic_chip_ops = { +const struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27182003b879..9489f93b3db3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int irq = desc->irq_data.irq; @@ -174,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags } __irq_wake_thread(desc, action); - - fallthrough; /* to add to randomness */ - case IRQ_HANDLED: - *flags |= action->flags; break; default: @@ -193,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval; - unsigned int flags = 0; - retval = __handle_irq_event_percpu(desc, &flags); + retval = __handle_irq_event_percpu(desc); - add_interrupt_randomness(desc->irq_data.irq, flags); + add_interrupt_randomness(desc->irq_data.irq); if (!irq_settings_no_debug(desc)) note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 54363527feea..99cbdf55a8bd 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data, extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7405e384e5ed..f23ffd30385b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, + bool setaffinity) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); @@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); - /* set the initial affinity to prevent every interrupt being on CPU0 */ - if (m) + if (m && setaffinity) __irq_set_affinity(irq, m, false); return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7f350ae59c5f..2bdfce5edafd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,12 +14,15 @@ #include <linux/irqdomain.h> #include <linux/msi.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/pci.h> #include "internals.h" +static inline int msi_sysfs_create_group(struct device *dev); + /** - * alloc_msi_entry - Allocate an initialized msi_desc + * msi_alloc_desc - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec @@ -29,34 +32,134 @@ * * Return: pointer to allocated &msi_desc on success or %NULL on failure */ -struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity) +static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, + const struct irq_affinity_desc *affinity) { - struct msi_desc *desc; + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); - desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; - INIT_LIST_HEAD(&desc->list); desc->dev = dev; desc->nvec_used = nvec; if (affinity) { - desc->affinity = kmemdup(affinity, - nvec * sizeof(*desc->affinity), GFP_KERNEL); + desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL); if (!desc->affinity) { kfree(desc); return NULL; } } - return desc; } -void free_msi_entry(struct msi_desc *entry) +static void msi_free_desc(struct msi_desc *desc) { - kfree(entry->affinity); - kfree(entry); + kfree(desc->affinity); + kfree(desc); +} + +static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index) +{ + int ret; + + desc->msi_index = index; + ret = xa_insert(&md->__store, index, desc, GFP_KERNEL); + if (ret) + msi_free_desc(desc); + return ret; +} + +/** + * msi_add_msi_desc - Allocate and initialize a MSI descriptor + * @dev: Pointer to the device for which the descriptor is allocated + * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor + * + * Return: 0 on success or an appropriate failure code. + */ +int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) +{ + struct msi_desc *desc; + + lockdep_assert_held(&dev->msi.data->mutex); + + desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity); + if (!desc) + return -ENOMEM; + + /* Copy type specific data to the new descriptor. */ + desc->pci = init_desc->pci; + return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index); +} + +/** + * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors + * @dev: Pointer to the device for which the descriptors are allocated + * @index: Index for the first MSI descriptor + * @ndesc: Number of descriptors to allocate + * + * Return: 0 on success or an appropriate failure code. + */ +static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc) +{ + unsigned int idx, last = index + ndesc - 1; + struct msi_desc *desc; + int ret; + + lockdep_assert_held(&dev->msi.data->mutex); + + for (idx = index; idx <= last; idx++) { + desc = msi_alloc_desc(dev, 1, NULL); + if (!desc) + goto fail_mem; + ret = msi_insert_desc(dev->msi.data, desc, idx); + if (ret) + goto fail; + } + return 0; + +fail_mem: + ret = -ENOMEM; +fail: + msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last); + return ret; +} + +static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) +{ + switch (filter) { + case MSI_DESC_ALL: + return true; + case MSI_DESC_NOTASSOCIATED: + return !desc->irq; + case MSI_DESC_ASSOCIATED: + return !!desc->irq; + } + WARN_ON_ONCE(1); + return false; +} + +/** + * msi_free_msi_descs_range - Free MSI descriptors of a device + * @dev: Device to free the descriptors + * @filter: Descriptor state filter + * @first_index: Index to start freeing from + * @last_index: Last index to be freed + */ +void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, + unsigned int first_index, unsigned int last_index) +{ + struct xarray *xa = &dev->msi.data->__store; + struct msi_desc *desc; + unsigned long idx; + + lockdep_assert_held(&dev->msi.data->mutex); + + xa_for_each_range(xa, idx, desc, first_index, last_index) { + if (msi_desc_match(desc, filter)) { + xa_erase(xa, idx); + msi_free_desc(desc); + } + } } void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) @@ -72,139 +175,290 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); -static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, - char *buf) +static void msi_device_data_release(struct device *dev, void *res) { - struct msi_desc *entry; - bool is_msix = false; - unsigned long irq; - int retval; + struct msi_device_data *md = res; - retval = kstrtoul(attr->attr.name, 10, &irq); - if (retval) - return retval; + WARN_ON_ONCE(!xa_empty(&md->__store)); + xa_destroy(&md->__store); + dev->msi.data = NULL; +} - entry = irq_get_msi_desc(irq); - if (!entry) - return -ENODEV; +/** + * msi_setup_device_data - Setup MSI device data + * @dev: Device for which MSI device data should be set up + * + * Return: 0 on success, appropriate error code otherwise + * + * This can be called more than once for @dev. If the MSI device data is + * already allocated the call succeeds. The allocated memory is + * automatically released when the device is destroyed. + */ +int msi_setup_device_data(struct device *dev) +{ + struct msi_device_data *md; + int ret; - if (dev_is_pci(dev)) - is_msix = entry->msi_attrib.is_msix; + if (dev->msi.data) + return 0; - return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); + md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL); + if (!md) + return -ENOMEM; + + ret = msi_sysfs_create_group(dev); + if (ret) { + devres_free(md); + return ret; + } + + xa_init(&md->__store); + mutex_init(&md->mutex); + dev->msi.data = md; + devres_add(dev, md); + return 0; +} + +/** + * msi_lock_descs - Lock the MSI descriptor storage of a device + * @dev: Device to operate on + */ +void msi_lock_descs(struct device *dev) +{ + mutex_lock(&dev->msi.data->mutex); +} +EXPORT_SYMBOL_GPL(msi_lock_descs); + +/** + * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * @dev: Device to operate on + */ +void msi_unlock_descs(struct device *dev) +{ + /* Invalidate the index wich was cached by the iterator */ + dev->msi.data->__iter_idx = MSI_MAX_INDEX; + mutex_unlock(&dev->msi.data->mutex); +} +EXPORT_SYMBOL_GPL(msi_unlock_descs); + +static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter) +{ + struct msi_desc *desc; + + xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) { + if (msi_desc_match(desc, filter)) + return desc; + } + md->__iter_idx = MSI_MAX_INDEX; + return NULL; } /** - * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices - * @dev: The device(PCI, platform etc) who will get sysfs entries + * msi_first_desc - Get the first MSI descriptor of a device + * @dev: Device to operate on + * @filter: Descriptor state filter + * + * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs() + * must be invoked before the call. * - * Return attribute_group ** so that specific bus MSI can save it to - * somewhere during initilizing msi irqs. If devices has no MSI irq, - * return NULL; if it fails to populate sysfs, return ERR_PTR + * Return: Pointer to the first MSI descriptor matching the search + * criteria, NULL if none found. */ -const struct attribute_group **msi_populate_sysfs(struct device *dev) -{ - const struct attribute_group **msi_irq_groups; - struct attribute **msi_attrs, *msi_attr; - struct device_attribute *msi_dev_attr; - struct attribute_group *msi_irq_group; - struct msi_desc *entry; - int ret = -ENOMEM; - int num_msi = 0; - int count = 0; - int i; +struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter) +{ + struct msi_device_data *md = dev->msi.data; - /* Determine how many msi entries we have */ - for_each_msi_entry(entry, dev) - num_msi += entry->nvec_used; - if (!num_msi) + if (WARN_ON_ONCE(!md)) return NULL; - /* Dynamically create the MSI attributes for the device */ - msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); - if (!msi_attrs) - return ERR_PTR(-ENOMEM); - - for_each_msi_entry(entry, dev) { - for (i = 0; i < entry->nvec_used; i++) { - msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); - if (!msi_dev_attr) - goto error_attrs; - msi_attrs[count] = &msi_dev_attr->attr; - - sysfs_attr_init(&msi_dev_attr->attr); - msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", - entry->irq + i); - if (!msi_dev_attr->attr.name) - goto error_attrs; - msi_dev_attr->attr.mode = 0444; - msi_dev_attr->show = msi_mode_show; - ++count; + lockdep_assert_held(&md->mutex); + + md->__iter_idx = 0; + return msi_find_desc(md, filter); +} +EXPORT_SYMBOL_GPL(msi_first_desc); + +/** + * msi_next_desc - Get the next MSI descriptor of a device + * @dev: Device to operate on + * + * The first invocation of msi_next_desc() has to be preceeded by a + * successful invocation of __msi_first_desc(). Consecutive invocations are + * only valid if the previous one was successful. All these operations have + * to be done within the same MSI mutex held region. + * + * Return: Pointer to the next MSI descriptor matching the search + * criteria, NULL if none found. + */ +struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) +{ + struct msi_device_data *md = dev->msi.data; + + if (WARN_ON_ONCE(!md)) + return NULL; + + lockdep_assert_held(&md->mutex); + + if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX) + return NULL; + + md->__iter_idx++; + return msi_find_desc(md, filter); +} +EXPORT_SYMBOL_GPL(msi_next_desc); + +/** + * msi_get_virq - Return Linux interrupt number of a MSI interrupt + * @dev: Device to operate on + * @index: MSI interrupt index to look for (0-based) + * + * Return: The Linux interrupt number on success (> 0), 0 if not found + */ +unsigned int msi_get_virq(struct device *dev, unsigned int index) +{ + struct msi_desc *desc; + unsigned int ret = 0; + bool pcimsi; + + if (!dev->msi.data) + return 0; + + pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false; + + msi_lock_descs(dev); + desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index); + if (desc && desc->irq) { + /* + * PCI-MSI has only one descriptor for multiple interrupts. + * PCI-MSIX and platform MSI use a descriptor per + * interrupt. + */ + if (pcimsi) { + if (index < desc->nvec_used) + ret = desc->irq + index; + } else { + ret = desc->irq; } } + msi_unlock_descs(dev); + return ret; +} +EXPORT_SYMBOL_GPL(msi_get_virq); - msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); - if (!msi_irq_group) - goto error_attrs; - msi_irq_group->name = "msi_irqs"; - msi_irq_group->attrs = msi_attrs; +#ifdef CONFIG_SYSFS +static struct attribute *msi_dev_attrs[] = { + NULL +}; - msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); - if (!msi_irq_groups) - goto error_irq_group; - msi_irq_groups[0] = msi_irq_group; +static const struct attribute_group msi_irqs_group = { + .name = "msi_irqs", + .attrs = msi_dev_attrs, +}; - ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); - if (ret) - goto error_irq_groups; - - return msi_irq_groups; - -error_irq_groups: - kfree(msi_irq_groups); -error_irq_group: - kfree(msi_irq_group); -error_attrs: - count = 0; - msi_attr = msi_attrs[count]; - while (msi_attr) { - msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); - kfree(msi_attr->name); - kfree(msi_dev_attr); - ++count; - msi_attr = msi_attrs[count]; +static inline int msi_sysfs_create_group(struct device *dev) +{ + return devm_device_add_group(dev, &msi_irqs_group); +} + +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + /* MSI vs. MSIX is per device not per interrupt */ + bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false; + + return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); +} + +static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) +{ + struct device_attribute *attrs = desc->sysfs_attrs; + int i; + + if (!attrs) + return; + + desc->sysfs_attrs = NULL; + for (i = 0; i < desc->nvec_used; i++) { + if (attrs[i].show) + sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name); + kfree(attrs[i].attr.name); } - kfree(msi_attrs); - return ERR_PTR(ret); + kfree(attrs); } +static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) +{ + struct device_attribute *attrs; + int ret, i; + + attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + desc->sysfs_attrs = attrs; + for (i = 0; i < desc->nvec_used; i++) { + sysfs_attr_init(&attrs[i].attr); + attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i); + if (!attrs[i].attr.name) { + ret = -ENOMEM; + goto fail; + } + + attrs[i].attr.mode = 0444; + attrs[i].show = msi_mode_show; + + ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name); + if (ret) { + attrs[i].show = NULL; + goto fail; + } + } + return 0; + +fail: + msi_sysfs_remove_desc(dev, desc); + return ret; +} + +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS /** - * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices - * @dev: The device(PCI, platform etc) who will remove sysfs entries - * @msi_irq_groups: attribute_group for device msi_irqs entries + * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device + * @dev: The device (PCI, platform etc) which will get sysfs entries */ -void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) -{ - struct device_attribute *dev_attr; - struct attribute **msi_attrs; - int count = 0; - - if (msi_irq_groups) { - sysfs_remove_groups(&dev->kobj, msi_irq_groups); - msi_attrs = msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], - struct device_attribute, attr); - kfree(dev_attr->attr.name); - kfree(dev_attr); - ++count; - } - kfree(msi_attrs); - kfree(msi_irq_groups[0]); - kfree(msi_irq_groups); +int msi_device_populate_sysfs(struct device *dev) +{ + struct msi_desc *desc; + int ret; + + msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + if (desc->sysfs_attrs) + continue; + ret = msi_sysfs_populate_desc(dev, desc); + if (ret) + return ret; } + return 0; } +/** + * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device + * @dev: The device (PCI, platform etc) for which to remove + * sysfs entries + */ +void msi_device_destroy_sysfs(struct device *dev) +{ + struct msi_desc *desc; + + msi_for_each_desc(desc, dev, MSI_DESC_ALL) + msi_sysfs_remove_desc(dev, desc); +} +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */ +#else /* CONFIG_SYSFS */ +static inline int msi_sysfs_create_group(struct device *dev) { return 0; } +static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; } +static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { } +#endif /* !CONFIG_SYSFS */ + #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) @@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, } int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, - int virq, int nvec, msi_alloc_info_t *arg) + int virq_base, int nvec, msi_alloc_info_t *arg) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; struct msi_desc *desc; - int ret = 0; + int ret, virq; - for_each_msi_entry(desc, dev) { - /* Don't even try the multi-MSI brain damage. */ - if (WARN_ON(!desc->irq || desc->nvec_used != 1)) { - ret = -EINVAL; - break; - } + msi_lock_descs(dev); + ret = msi_add_simple_msi_descs(dev, virq_base, nvec); + if (ret) + goto unlock; - if (!(desc->irq >= virq && desc->irq < (virq + nvec))) - continue; + for (virq = virq_base; virq < virq_base + nvec; virq++) { + desc = xa_load(&dev->msi.data->__store, virq); + desc->irq = virq; ops->set_desc(arg, desc); - /* Assumes the domain mutex is held! */ - ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1, - arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (ret) - break; + goto fail; - irq_set_msi_desc_off(desc->irq, 0, desc); - } - - if (ret) { - /* Mop up the damage */ - for_each_msi_entry(desc, dev) { - if (!(desc->irq >= virq && desc->irq < (virq + nvec))) - continue; - - irq_domain_free_irqs_common(domain, desc->irq, 1); - } + irq_set_msi_desc(virq, desc); } + msi_unlock_descs(dev); + return 0; +fail: + for (--virq; virq >= virq_base; virq--) + irq_domain_free_irqs_common(domain, virq, 1); + msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1); +unlock: + msi_unlock_descs(dev); return ret; } @@ -531,8 +780,59 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, * Checking the first MSI descriptor is sufficient. MSIX supports * masking and MSI does so when the can_mask attribute is set. */ - desc = first_msi_entry(dev); - return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask; + desc = msi_first_desc(dev, MSI_DESC_ALL); + return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask; +} + +static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, + int allocated) +{ + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + if (IS_ENABLED(CONFIG_PCI_MSI)) + break; + fallthrough; + default: + return -ENOSPC; + } + + /* Let a failed PCI multi MSI allocation retry */ + if (desc->nvec_used > 1) + return 1; + + /* If there was a successful allocation let the caller know */ + return allocated ? allocated : -ENOSPC; +} + +#define VIRQ_CAN_RESERVE 0x01 +#define VIRQ_ACTIVATE 0x02 +#define VIRQ_NOMASK_QUIRK 0x04 + +static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags) +{ + struct irq_data *irqd = irq_domain_get_irq_data(domain, virq); + int ret; + + if (!(vflags & VIRQ_CAN_RESERVE)) { + irqd_clr_can_reserve(irqd); + if (vflags & VIRQ_NOMASK_QUIRK) + irqd_set_msi_nomask_quirk(irqd); + } + + if (!(vflags & VIRQ_ACTIVATE)) + return 0; + + ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE); + if (ret) + return ret; + /* + * If the interrupt uses reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (vflags & VIRQ_CAN_RESERVE) + irqd_clr_activated(irqd); + return 0; } int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, @@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - struct irq_data *irq_data; - struct msi_desc *desc; msi_alloc_info_t arg = { }; + unsigned int vflags = 0; + struct msi_desc *desc; + int allocated = 0; int i, ret, virq; - bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; - for_each_msi_entry(desc, dev) { + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) + vflags |= VIRQ_ACTIVATE; + + /* + * Interrupt can use a reserved vector and will not occupy + * a real device vector until the interrupt is requested. + */ + if (msi_check_reservation_mode(domain, info, dev)) { + vflags |= VIRQ_CAN_RESERVE; + /* + * MSI affinity setting requires a special quirk (X86) when + * reservation mode is active. + */ + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + vflags |= VIRQ_NOMASK_QUIRK; + } + + msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) { ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false, desc->affinity); - if (virq < 0) { - ret = -ENOSPC; - if (ops->handle_error) - ret = ops->handle_error(domain, desc, ret); - if (ops->msi_finish) - ops->msi_finish(&arg, ret); - return ret; - } + if (virq < 0) + return msi_handle_pci_fail(domain, desc, allocated); for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); irq_debugfs_copy_devname(virq + i, dev); + ret = msi_init_virq(domain, virq + i, vflags); + if (ret) + return ret; + } + if (info->flags & MSI_FLAG_DEV_SYSFS) { + ret = msi_sysfs_populate_desc(dev, desc); + if (ret) + return ret; } + allocated++; } + return 0; +} - if (ops->msi_finish) - ops->msi_finish(&arg, 0); +static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info, + struct device *dev, + unsigned int num_descs) +{ + if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS)) + return 0; - can_reserve = msi_check_reservation_mode(domain, info, dev); + return msi_add_simple_msi_descs(dev, 0, num_descs); +} - /* - * This flag is set by the PCI layer as we need to activate - * the MSI entries before the PCI layer enables MSI in the - * card. Otherwise the card latches a random msi message. - */ - if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) - goto skip_activate; - - for_each_msi_vector(desc, i, dev) { - if (desc->irq == i) { - virq = desc->irq; - dev_dbg(dev, "irq [%d-%d] for MSI\n", - virq, virq + desc->nvec_used - 1); - } +/** + * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @nvec: The number of interrupts to allocate + * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own vector + * allocation/free. + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + int ret; - irq_data = irq_domain_get_irq_data(domain, i); - if (!can_reserve) { - irqd_clr_can_reserve(irq_data); - if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) - irqd_set_msi_nomask_quirk(irq_data); - } - ret = irq_domain_activate_irq(irq_data, can_reserve); - if (ret) - goto cleanup; - } + lockdep_assert_held(&dev->msi.data->mutex); -skip_activate: - /* - * If these interrupts use reservation mode, clear the activated bit - * so request_irq() will assign the final vector. - */ - if (can_reserve) { - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - irqd_clr_activated(irq_data); - } - } - return 0; + ret = msi_domain_add_simple_msi_descs(info, dev, nvec); + if (ret) + return ret; -cleanup: - msi_domain_free_irqs(domain, dev); + ret = ops->domain_alloc_irqs(domain, dev, nvec); + if (ret) + msi_domain_free_irqs_descs_locked(domain, dev); return ret; } @@ -629,52 +949,78 @@ cleanup: * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) { - struct msi_domain_info *info = domain->host_data; - struct msi_domain_ops *ops = info->ops; + int ret; - return ops->domain_alloc_irqs(domain, dev, nvec); + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec); + msi_unlock_descs(dev); + return ret; } void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { - struct irq_data *irq_data; + struct msi_domain_info *info = domain->host_data; + struct irq_data *irqd; struct msi_desc *desc; int i; - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } - - for_each_msi_entry(desc, dev) { - /* - * We might have failed to allocate an MSI early - * enough that there is no IRQ associated to this - * entry. If that's the case, don't do anything. - */ - if (desc->irq) { - irq_domain_free_irqs(desc->irq, desc->nvec_used); - desc->irq = 0; + /* Only handle MSI entries which have an interrupt associated */ + msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + /* Make sure all interrupts are deactivated */ + for (i = 0; i < desc->nvec_used; i++) { + irqd = irq_domain_get_irq_data(domain, desc->irq + i); + if (irqd && irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); } + + irq_domain_free_irqs(desc->irq, desc->nvec_used); + if (info->flags & MSI_FLAG_DEV_SYSFS) + msi_sysfs_remove_desc(dev, desc); + desc->irq = 0; } } +static void msi_domain_free_msi_descs(struct msi_domain_info *info, + struct device *dev) +{ + if (info->flags & MSI_FLAG_FREE_MSI_DESCS) + msi_free_msi_descs(dev); +} + /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev + * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev * @domain: The domain to managing the interrupts * @dev: Pointer to device struct of the device for which the interrupts * are free + * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own vector + * allocation. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - return ops->domain_free_irqs(domain, dev); + lockdep_assert_held(&dev->msi.data->mutex); + + ops->domain_free_irqs(domain, dev); + msi_domain_free_msi_descs(info, dev); +} + +/** + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + msi_lock_descs(dev); + msi_domain_free_irqs_descs_locked(domain, dev); + msi_unlock_descs(dev); } /** diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ee595ec09778..623b8136e9af 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); + unsigned int irq = (int)(long)pde_data(file_inode(file)); cpumask_var_t new_value; int err; @@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_proc_show, pde_data(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_list_proc_show, pde_data(inode)); } static const struct proc_ops irq_affinity_proc_ops = { @@ -265,7 +265,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE_DATA(inode)); + return single_open(file, default_affinity_show, pde_data(inode)); } static const struct proc_ops default_affinity_proc_ops = { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3011bc33a5ba..951c93216fc4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -243,6 +243,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; + cond_resched(); } return 0; } diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index c2bb07f5bcc7..4f35d1bced6a 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + $(call cc-option,-mno-outline-atomics) \ -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o + +KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4b84c8e7884b..fe12dfe254ec 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); module_param_named(skip_watch, kcsan_skip_watch, long, 0644); module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444); +#ifdef CONFIG_KCSAN_WEAK_MEMORY +static bool kcsan_weak_memory = true; +module_param_named(weak_memory, kcsan_weak_memory, bool, 0644); +#else +#define kcsan_weak_memory false +#endif + bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, - .access_mask = 0, .scoped_accesses = {LIST_POISON1, NULL}, }; @@ -209,15 +211,17 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip); static noinline void kcsan_check_scoped_accesses(void) { struct kcsan_ctx *ctx = get_ctx(); - struct list_head *prev_save = ctx->scoped_accesses.prev; struct kcsan_scoped_access *scoped_access; - ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */ + if (ctx->disable_scoped) + return; + + ctx->disable_scoped++; list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) { check_access(scoped_access->ptr, scoped_access->size, scoped_access->type, scoped_access->ip); } - ctx->scoped_accesses.prev = prev_save; + ctx->disable_scoped--; } /* Rules for generic atomic accesses. Called from fast-path. */ @@ -325,6 +329,21 @@ static void delay_access(int type) udelay(delay); } +/* + * Reads the instrumented memory for value change detection; value change + * detection is currently done for accesses up to a size of 8 bytes. + */ +static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size) +{ + switch (size) { + case 1: return READ_ONCE(*(const u8 *)ptr); + case 2: return READ_ONCE(*(const u16 *)ptr); + case 4: return READ_ONCE(*(const u32 *)ptr); + case 8: return READ_ONCE(*(const u64 *)ptr); + default: return 0; /* Ignore; we do not diff the values. */ + } +} + void kcsan_save_irqtrace(struct task_struct *task) { #ifdef CONFIG_TRACE_IRQFLAGS @@ -339,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task) #endif } +static __always_inline int get_kcsan_stack_depth(void) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + return current->kcsan_stack_depth; +#else + BUILD_BUG(); + return 0; +#endif +} + +static __always_inline void add_kcsan_stack_depth(int val) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + current->kcsan_stack_depth += val; +#else + BUILD_BUG(); +#endif +} + +static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + return ctx->disable_scoped ? NULL : &ctx->reorder_access; +#else + return NULL; +#endif +} + +static __always_inline bool +find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, + int type, unsigned long ip) +{ + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (!reorder_access) + return false; + + /* + * Note: If accesses are repeated while reorder_access is identical, + * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED). + */ + return reorder_access->ptr == ptr && reorder_access->size == size && + reorder_access->type == type && reorder_access->ip == ip; +} + +static inline void +set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, + int type, unsigned long ip) +{ + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (!reorder_access || !kcsan_weak_memory) + return; + + /* + * To avoid nested interrupts or scheduler (which share kcsan_ctx) + * reading an inconsistent reorder_access, ensure that the below has + * exclusive access to reorder_access by disallowing concurrent use. + */ + ctx->disable_scoped++; + barrier(); + reorder_access->ptr = ptr; + reorder_access->size = size; + reorder_access->type = type | KCSAN_ACCESS_SCOPED; + reorder_access->ip = ip; + reorder_access->stack_depth = get_kcsan_stack_depth(); + barrier(); + ctx->disable_scoped--; +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -377,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * The access_mask check relies on value-change comparison. To avoid * reporting a race where e.g. the writer set up the watchpoint, but the * reader has access_mask!=0, we have to ignore the found watchpoint. + * + * reorder_access is never created from an access with access_mask set. */ - if (ctx->access_mask) + if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip)) return; /* @@ -428,11 +519,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; atomic_long_t *watchpoint; u64 old, new, diff; - unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; + bool interrupt_watcher = kcsan_interrupt_watcher; unsigned long ua_flags = user_access_save(); struct kcsan_ctx *ctx = get_ctx(); + unsigned long access_mask = ctx->access_mask; unsigned long irq_flags = 0; + bool is_reorder_access; /* * Always reset kcsan_skip counter in slow-path to avoid underflow; see @@ -456,12 +549,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned } /* + * The local CPU cannot observe reordering of its own accesses, and + * therefore we need to take care of 2 cases to avoid false positives: + * + * 1. Races of the reordered access with interrupts. To avoid, if + * the current access is reorder_access, disable interrupts. + * 2. Avoid races of scoped accesses from nested interrupts (below). + */ + is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip); + if (is_reorder_access) + interrupt_watcher = false; + /* + * Avoid races of scoped accesses from nested interrupts (or scheduler). + * Assume setting up a watchpoint for a non-scoped (normal) access that + * also conflicts with a current scoped access. In a nested interrupt, + * which shares the context, it would check a conflicting scoped access. + * To avoid, disable scoped access checking. + */ + ctx->disable_scoped++; + + /* * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's * runtime is entered for every memory access, and potentially useful * information is lost if dirtied by KCSAN. */ kcsan_save_irqtrace(current); - if (!kcsan_interrupt_watcher) + if (!interrupt_watcher) local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); @@ -482,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Read the current value, to later check and infer a race if the data * was modified via a non-instrumented access, e.g. from a device. */ - old = 0; - switch (size) { - case 1: - old = READ_ONCE(*(const u8 *)ptr); - break; - case 2: - old = READ_ONCE(*(const u16 *)ptr); - break; - case 4: - old = READ_ONCE(*(const u32 *)ptr); - break; - case 8: - old = READ_ONCE(*(const u64 *)ptr); - break; - default: - break; /* ignore; we do not diff the values */ - } + old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size); /* * Delay this thread, to increase probability of observing a racy @@ -510,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ - access_mask = ctx->access_mask; - new = 0; - switch (size) { - case 1: - new = READ_ONCE(*(const u8 *)ptr); - break; - case 2: - new = READ_ONCE(*(const u16 *)ptr); - break; - case 4: - new = READ_ONCE(*(const u32 *)ptr); - break; - case 8: - new = READ_ONCE(*(const u64 *)ptr); - break; - default: - break; /* ignore; we do not diff the values */ + if (!is_reorder_access) { + new = read_instrumented_memory(ptr, size); + } else { + /* + * Reordered accesses cannot be used for value change detection, + * because the memory location may no longer be accessible and + * could result in a fault. + */ + new = 0; + access_mask = 0; } diff = old ^ new; @@ -596,10 +686,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned */ remove_watchpoint(watchpoint); atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); + out_unlock: - if (!kcsan_interrupt_watcher) + if (!interrupt_watcher) local_irq_restore(irq_flags); kcsan_restore_irqtrace(current); + ctx->disable_scoped--; + + /* + * Reordered accesses cannot be used for value change detection, + * therefore never consider for reordering if access_mask is set. + * ASSERT_EXCLUSIVE are not real accesses, ignore them as well. + */ + if (!access_mask && !is_assert) + set_reorder_access(ctx, ptr, size, type, ip); out: user_access_restore(ua_flags); } @@ -607,7 +707,6 @@ out: static __always_inline void check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) { - const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; atomic_long_t *watchpoint; long encoded_watchpoint; @@ -618,12 +717,14 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) if (unlikely(size == 0)) return; +again: /* * Avoid user_access_save in fast-path: find_watchpoint is safe without * user_access_save, as the address that ptr points to is only used to * check if a watchpoint exists; ptr is never dereferenced. */ - watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + watchpoint = find_watchpoint((unsigned long)ptr, size, + !(type & KCSAN_ACCESS_WRITE), &encoded_watchpoint); /* * It is safe to check kcsan_is_enabled() after find_watchpoint in the @@ -637,9 +738,42 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) else { struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ - if (unlikely(should_watch(ctx, ptr, size, type))) + if (unlikely(should_watch(ctx, ptr, size, type))) { kcsan_setup_watchpoint(ptr, size, type, ip); - else if (unlikely(ctx->scoped_accesses.prev)) + return; + } + + if (!(type & KCSAN_ACCESS_SCOPED)) { + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (reorder_access) { + /* + * reorder_access check: simulates reordering of + * the access after subsequent operations. + */ + ptr = reorder_access->ptr; + type = reorder_access->type; + ip = reorder_access->ip; + /* + * Upon a nested interrupt, this context's + * reorder_access can be modified (shared ctx). + * We know that upon return, reorder_access is + * always invalidated by setting size to 0 via + * __tsan_func_exit(). Therefore we must read + * and check size after the other fields. + */ + barrier(); + size = READ_ONCE(reorder_access->size); + if (size) + goto again; + } + } + + /* + * Always checked last, right before returning from runtime; + * if reorder_access is valid, checked after it was checked. + */ + if (unlikely(ctx->scoped_accesses.prev)) kcsan_check_scoped_accesses(); } } @@ -814,6 +948,22 @@ void __kcsan_check_access(const volatile void *ptr, size_t size, int type) } EXPORT_SYMBOL(__kcsan_check_access); +#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \ + void __kcsan_##name(void) \ + { \ + struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \ + if (!sa) \ + return; \ + if (order_before_cond) \ + sa->size = 0; \ + } \ + EXPORT_SYMBOL(__kcsan_##name) + +DEFINE_MEMORY_BARRIER(mb, true); +DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND)); +DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND)); +DEFINE_MEMORY_BARRIER(release, true); + /* * KCSAN uses the same instrumentation that is emitted by supported compilers * for ThreadSanitizer (TSAN). @@ -926,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8); DEFINE_TSAN_VOLATILE_READ_WRITE(16); /* - * The below are not required by KCSAN, but can still be emitted by the - * compiler. + * Function entry and exit are used to determine the validty of reorder_access. + * Reordering of the access ends at the end of the function scope where the + * access happened. This is done for two reasons: + * + * 1. Artificially limits the scope where missing barriers are detected. + * This minimizes false positives due to uninstrumented functions that + * contain the required barriers but were missed. + * + * 2. Simplifies generating the stack trace of the access. */ void __tsan_func_entry(void *call_pc); -void __tsan_func_entry(void *call_pc) +noinline void __tsan_func_entry(void *call_pc) { + if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + return; + + add_kcsan_stack_depth(1); } EXPORT_SYMBOL(__tsan_func_entry); + void __tsan_func_exit(void); -void __tsan_func_exit(void) +noinline void __tsan_func_exit(void) { + struct kcsan_scoped_access *reorder_access; + + if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + return; + + reorder_access = get_reorder_access(get_ctx()); + if (!reorder_access) + goto out; + + if (get_kcsan_stack_depth() <= reorder_access->stack_depth) { + /* + * Access check to catch cases where write without a barrier + * (supposed release) was last access in function: because + * instrumentation is inserted before the real access, a data + * race due to the write giving up a c-s would only be caught if + * we do the conflicting access after. + */ + check_access(reorder_access->ptr, reorder_access->size, + reorder_access->type, reorder_access->ip); + reorder_access->size = 0; + reorder_access->stack_depth = INT_MIN; + } +out: + add_kcsan_stack_depth(-1); } EXPORT_SYMBOL(__tsan_func_exit); + void __tsan_init(void); void __tsan_init(void) { @@ -961,10 +1148,19 @@ EXPORT_SYMBOL(__tsan_init); * functions, whose job is to also execute the operation itself. */ +static __always_inline void kcsan_atomic_builtin_memorder(int memorder) +{ + if (memorder == __ATOMIC_RELEASE || + memorder == __ATOMIC_SEQ_CST || + memorder == __ATOMIC_ACQ_REL) + __kcsan_release(); +} + #define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \ } \ @@ -974,6 +1170,7 @@ EXPORT_SYMBOL(__tsan_init); void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \ @@ -986,6 +1183,7 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1018,6 +1216,7 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ + kcsan_atomic_builtin_memorder(mo); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1033,6 +1232,7 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ + kcsan_atomic_builtin_memorder(mo); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1064,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64); void __tsan_atomic_thread_fence(int memorder); void __tsan_atomic_thread_fence(int memorder) { + kcsan_atomic_builtin_memorder(memorder); __atomic_thread_fence(memorder); } EXPORT_SYMBOL(__tsan_atomic_thread_fence); +/* + * In instrumented files, we emit instrumentation for barriers by mapping the + * kernel barriers to an __atomic_signal_fence(), which is interpreted specially + * and otherwise has no relation to a real __atomic_signal_fence(). No known + * kernel code uses __atomic_signal_fence(). + * + * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which + * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation + * can be disabled via the __no_kcsan function attribute (vs. an explicit call + * which could not). When __no_kcsan is requested, __atomic_signal_fence() + * generates no code. + * + * Note: The result of using __atomic_signal_fence() with KCSAN enabled is + * potentially limiting the compiler's ability to reorder operations; however, + * if barriers were instrumented with explicit calls (without LTO), the compiler + * couldn't optimize much anyway. The result of a hypothetical architecture + * using __atomic_signal_fence() in normal code would be KCSAN false negatives. + */ void __tsan_atomic_signal_fence(int memorder); -void __tsan_atomic_signal_fence(int memorder) { } +noinline void __tsan_atomic_signal_fence(int memorder) +{ + switch (memorder) { + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb: + __kcsan_mb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb: + __kcsan_wmb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb: + __kcsan_rmb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release: + __kcsan_release(); + break; + default: + break; + } +} EXPORT_SYMBOL(__tsan_atomic_signal_fence); diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 660729238588..a36fca063a73 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -16,9 +16,12 @@ #define pr_fmt(fmt) "kcsan_test: " fmt #include <kunit/test.h> +#include <linux/atomic.h> +#include <linux/bitops.h> #include <linux/jiffies.h> #include <linux/kcsan-checks.h> #include <linux/kernel.h> +#include <linux/mutex.h> #include <linux/sched.h> #include <linux/seqlock.h> #include <linux/spinlock.h> @@ -151,7 +154,7 @@ struct expect_report { /* Check observed report matches information in @r. */ __no_kcsan -static bool report_matches(const struct expect_report *r) +static bool __report_matches(const struct expect_report *r) { const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; bool ret = false; @@ -213,9 +216,9 @@ static bool report_matches(const struct expect_report *r) const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC); const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED); const char *const access_type_aux = - (is_atomic && is_scoped) ? " (marked, scoped)" + (is_atomic && is_scoped) ? " (marked, reordered)" : (is_atomic ? " (marked)" - : (is_scoped ? " (scoped)" : "")); + : (is_scoped ? " (reordered)" : "")); if (i == 1) { /* Access 2 */ @@ -253,6 +256,40 @@ out: return ret; } +static __always_inline const struct expect_report * +__report_set_scoped(struct expect_report *r, int accesses) +{ + BUILD_BUG_ON(accesses > 3); + + if (accesses & 1) + r->access[0].type |= KCSAN_ACCESS_SCOPED; + else + r->access[0].type &= ~KCSAN_ACCESS_SCOPED; + + if (accesses & 2) + r->access[1].type |= KCSAN_ACCESS_SCOPED; + else + r->access[1].type &= ~KCSAN_ACCESS_SCOPED; + + return r; +} + +__no_kcsan +static bool report_matches_any_reordered(struct expect_report *r) +{ + return __report_matches(__report_set_scoped(r, 0)) || + __report_matches(__report_set_scoped(r, 1)) || + __report_matches(__report_set_scoped(r, 2)) || + __report_matches(__report_set_scoped(r, 3)); +} + +#ifdef CONFIG_KCSAN_WEAK_MEMORY +/* Due to reordering accesses, any access may appear as "(reordered)". */ +#define report_matches report_matches_any_reordered +#else +#define report_matches __report_matches +#endif + /* ===== Test kernels ===== */ static long test_sink; @@ -263,6 +300,8 @@ static struct { long val[8]; } test_struct; static DEFINE_SEQLOCK(test_seqlock); +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_MUTEX(test_mutex); /* * Helper to avoid compiler optimizing out reads, and to generate source values @@ -271,6 +310,16 @@ static DEFINE_SEQLOCK(test_seqlock); __no_kcsan static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } +/* + * Generates a delay and some accesses that enter the runtime but do not produce + * data races. + */ +static noinline void test_delay(int iter) +{ + while (iter--) + sink_value(READ_ONCE(test_sink)); +} + static noinline void test_kernel_read(void) { sink_value(test_var); } static noinline void test_kernel_write(void) @@ -432,19 +481,239 @@ static noinline void test_kernel_xor_1bit(void) kcsan_nestable_atomic_end(); } +#define TEST_KERNEL_LOCKED(name, acquire, release) \ + static noinline void test_kernel_##name(void) \ + { \ + long *flag = &test_struct.val[0]; \ + long v = 0; \ + if (!(acquire)) \ + return; \ + while (v++ < 100) { \ + test_var++; \ + barrier(); \ + } \ + release; \ + test_delay(10); \ + } + +TEST_KERNEL_LOCKED(with_memorder, + cmpxchg_acquire(flag, 0, 1) == 0, + smp_store_release(flag, 0)); +TEST_KERNEL_LOCKED(wrong_memorder, + cmpxchg_relaxed(flag, 0, 1) == 0, + WRITE_ONCE(*flag, 0)); +TEST_KERNEL_LOCKED(atomic_builtin_with_memorder, + __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED), + __atomic_store_n(flag, 0, __ATOMIC_RELEASE)); +TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder, + __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED), + __atomic_store_n(flag, 0, __ATOMIC_RELAXED)); + /* ===== Test cases ===== */ +/* + * Tests that various barriers have the expected effect on internal state. Not + * exhaustive on atomic_t operations. Unlike the selftest, also checks for + * too-strict barrier instrumentation; these can be tolerated, because it does + * not cause false positives, but at least we should be aware of such cases. + */ +static void test_barrier_nothreads(struct kunit *test) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + struct kcsan_scoped_access *reorder_access = ¤t->kcsan_ctx.reorder_access; +#else + struct kcsan_scoped_access *reorder_access = NULL; +#endif + arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; + atomic_t dummy; + + KCSAN_TEST_REQUIRES(test, reorder_access != NULL); + KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP)); + +#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \ + do { \ + reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \ + reorder_access->size = sizeof(test_var); \ + barrier; \ + KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \ + order_before ? 0 : sizeof(test_var), \ + "improperly instrumented type=(" #access_type "): " name); \ + } while (0) +#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b) +#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b) +#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b) + + /* + * Lockdep initialization can strengthen certain locking operations due + * to calling into instrumented files; "warm up" our locks. + */ + spin_lock(&test_spinlock); + spin_unlock(&test_spinlock); + mutex_lock(&test_mutex); + mutex_unlock(&test_mutex); + + /* Force creating a valid entry in reorder_access first. */ + test_var = 0; + while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var)) + __kcsan_check_read(&test_var, sizeof(test_var)); + KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var)); + + kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */ + + KCSAN_EXPECT_READ_BARRIER(mb(), true); + KCSAN_EXPECT_READ_BARRIER(wmb(), false); + KCSAN_EXPECT_READ_BARRIER(rmb(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb(), true); + KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false); + KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true); + KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false); + KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true); + KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false); + KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false); + KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true); + KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true); + KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false); + KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false); + KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true); + KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false); + KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true); + KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false); + KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true); + KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false); + KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true); + + KCSAN_EXPECT_WRITE_BARRIER(mb(), true); + KCSAN_EXPECT_WRITE_BARRIER(wmb(), true); + KCSAN_EXPECT_WRITE_BARRIER(rmb(), false); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false); + KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true); + KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false); + KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false); + KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false); + KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true); + KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false); + KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true); + KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false); + KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true); + + KCSAN_EXPECT_RW_BARRIER(mb(), true); + KCSAN_EXPECT_RW_BARRIER(wmb(), true); + KCSAN_EXPECT_RW_BARRIER(rmb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true); + KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true); + KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true); + KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false); + KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false); + KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true); + KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true); + KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false); + KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false); + KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true); + KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false); + KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true); + KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false); + KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true); + KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); + KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); + +#ifdef clear_bit_unlock_is_negative_byte + KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); +#endif + kcsan_nestable_atomic_end(); +} + /* Simple test with normal data race. */ __no_kcsan static void test_basic(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - static const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, @@ -469,14 +738,14 @@ static void test_basic(struct kunit *test) __no_kcsan static void test_concurrent_races(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { /* NULL will match any address. */ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) }, }, }; - static const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_rmw_array, NULL, 0, 0 }, { test_kernel_rmw_array, NULL, 0, 0 }, @@ -498,13 +767,13 @@ static void test_concurrent_races(struct kunit *test) __no_kcsan static void test_novalue_change(struct kunit *test) { - const struct expect_report expect_rw = { + struct expect_report expect_rw = { .access = { { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - const struct expect_report expect_ww = { + struct expect_report expect_ww = { .access = { { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -530,13 +799,13 @@ static void test_novalue_change(struct kunit *test) __no_kcsan static void test_novalue_change_exception(struct kunit *test) { - const struct expect_report expect_rw = { + struct expect_report expect_rw = { .access = { { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - const struct expect_report expect_ww = { + struct expect_report expect_ww = { .access = { { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -556,7 +825,7 @@ static void test_novalue_change_exception(struct kunit *test) __no_kcsan static void test_unknown_origin(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { NULL }, @@ -578,7 +847,7 @@ static void test_unknown_origin(struct kunit *test) __no_kcsan static void test_write_write_assume_atomic(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -604,7 +873,7 @@ static void test_write_write_assume_atomic(struct kunit *test) __no_kcsan static void test_write_write_struct(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, @@ -626,7 +895,7 @@ static void test_write_write_struct(struct kunit *test) __no_kcsan static void test_write_write_struct_part(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, @@ -658,7 +927,7 @@ static void test_read_atomic_write_atomic(struct kunit *test) __no_kcsan static void test_read_plain_atomic_write(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, @@ -679,7 +948,7 @@ static void test_read_plain_atomic_write(struct kunit *test) __no_kcsan static void test_read_plain_atomic_rmw(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_atomic_rmw, &test_var, sizeof(test_var), @@ -701,13 +970,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test) __no_kcsan static void test_zero_size_access(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, }, }; - const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, @@ -741,7 +1010,7 @@ static void test_data_race(struct kunit *test) __no_kcsan static void test_assert_exclusive_writer(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -759,7 +1028,7 @@ static void test_assert_exclusive_writer(struct kunit *test) __no_kcsan static void test_assert_exclusive_access(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, @@ -777,19 +1046,19 @@ static void test_assert_exclusive_access(struct kunit *test) __no_kcsan static void test_assert_exclusive_access_writer(struct kunit *test) { - const struct expect_report expect_access_writer = { + struct expect_report expect_access_writer = { .access = { { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, }, }; - const struct expect_report expect_access_access = { + struct expect_report expect_access_access = { .access = { { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, }, }; - const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, @@ -813,7 +1082,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test) __no_kcsan static void test_assert_exclusive_bits_change(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, { test_kernel_change_bits, &test_var, sizeof(test_var), @@ -844,13 +1113,13 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test) __no_kcsan static void test_assert_exclusive_writer_scoped(struct kunit *test) { - const struct expect_report expect_start = { + struct expect_report expect_start = { .access = { { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, }, }; - const struct expect_report expect_inscope = { + struct expect_report expect_inscope = { .access = { { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -871,16 +1140,16 @@ static void test_assert_exclusive_writer_scoped(struct kunit *test) __no_kcsan static void test_assert_exclusive_access_scoped(struct kunit *test) { - const struct expect_report expect_start1 = { + struct expect_report expect_start1 = { .access = { { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - const struct expect_report expect_start2 = { + struct expect_report expect_start2 = { .access = { expect_start1.access[0], expect_start1.access[0] }, }; - const struct expect_report expect_inscope = { + struct expect_report expect_inscope = { .access = { { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, @@ -985,7 +1254,7 @@ static void test_atomic_builtins(struct kunit *test) __no_kcsan static void test_1bit_value_change(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, @@ -1005,6 +1274,90 @@ static void test_1bit_value_change(struct kunit *test) KUNIT_EXPECT_TRUE(test, match); } +__no_kcsan +static void test_correct_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_FALSE(test, match_expect); +} + +__no_kcsan +static void test_missing_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +__no_kcsan +static void test_atomic_builtins_correct_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_atomic_builtin_with_memorder, + test_kernel_atomic_builtin_with_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_FALSE(test, match_expect); +} + +__no_kcsan +static void test_atomic_builtins_missing_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_atomic_builtin_wrong_memorder, + test_kernel_atomic_builtin_wrong_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + /* * Generate thread counts for all test cases. Values generated are in interval * [2, 5] followed by exponentially increasing thread counts from 8 to 32. @@ -1054,6 +1407,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc) #define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params) static struct kunit_case kcsan_test_cases[] = { + KUNIT_CASE(test_barrier_nothreads), KCSAN_KUNIT_CASE(test_basic), KCSAN_KUNIT_CASE(test_concurrent_races), KCSAN_KUNIT_CASE(test_novalue_change), @@ -1078,6 +1432,10 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_seqlock_noreport), KCSAN_KUNIT_CASE(test_atomic_builtins), KCSAN_KUNIT_CASE(test_1bit_value_change), + KCSAN_KUNIT_CASE(test_correct_barrier), + KCSAN_KUNIT_CASE(test_missing_barrier), + KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier), + KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier), {}, }; @@ -1142,6 +1500,9 @@ static int test_init(struct kunit *test) observed.nlines = 0; spin_unlock_irqrestore(&observed.lock, flags); + if (strstr(test->name, "nothreads")) + return 0; + if (!torture_init_begin((char *)test->name, 1)) return -EBUSY; @@ -1184,6 +1545,9 @@ static void test_exit(struct kunit *test) struct task_struct **stop_thread; int i; + if (strstr(test->name, "nothreads")) + return; + if (torture_cleanup_begin()) return; diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index fc15077991c4..67794404042a 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -215,9 +215,9 @@ static const char *get_access_type(int type) if (type & KCSAN_ACCESS_ASSERT) { if (type & KCSAN_ACCESS_SCOPED) { if (type & KCSAN_ACCESS_WRITE) - return "assert no accesses (scoped)"; + return "assert no accesses (reordered)"; else - return "assert no writes (scoped)"; + return "assert no writes (reordered)"; } else { if (type & KCSAN_ACCESS_WRITE) return "assert no accesses"; @@ -240,17 +240,17 @@ static const char *get_access_type(int type) case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "read-write (marked)"; case KCSAN_ACCESS_SCOPED: - return "read (scoped)"; + return "read (reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: - return "read (marked, scoped)"; + return "read (marked, reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE: - return "write (scoped)"; + return "write (reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "write (marked, scoped)"; + return "write (marked, reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE: - return "read-write (scoped)"; + return "read-write (reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "read-write (marked, scoped)"; + return "read-write (marked, reordered)"; default: BUG(); } @@ -308,10 +308,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries /* * Skips to the first entry that matches the function of @ip, and then replaces - * that entry with @ip, returning the entries to skip. + * that entry with @ip, returning the entries to skip with @replaced containing + * the replaced entry. */ static int -replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip) +replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip, + unsigned long *replaced) { unsigned long symbolsize, offset; unsigned long target_func; @@ -330,6 +332,7 @@ replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned lon func -= offset; if (func == target_func) { + *replaced = stack_entries[skip]; stack_entries[skip] = ip; return skip; } @@ -342,9 +345,10 @@ fallback: } static int -sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip) +sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip, + unsigned long *replaced) { - return ip ? replace_stack_entry(stack_entries, num_entries, ip) : + return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) : get_stack_skipnr(stack_entries, num_entries); } @@ -360,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2) return strncmp(buf1, buf2, sizeof(buf1)); } +static void +print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to) +{ + stack_trace_print(stack_entries, num_entries, 0); + if (reordered_to) + pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to); +} + static void print_verbose_info(struct task_struct *task) { if (!task) @@ -378,10 +390,12 @@ static void print_report(enum kcsan_value_change value_change, struct other_info *other_info, u64 old, u64 new, u64 mask) { + unsigned long reordered_to = 0; unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); - int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip); + int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to); unsigned long this_frame = stack_entries[skipnr]; + unsigned long other_reordered_to = 0; unsigned long other_frame = 0; int other_skipnr = 0; /* silence uninit warnings */ @@ -394,7 +408,7 @@ static void print_report(enum kcsan_value_change value_change, if (other_info) { other_skipnr = sanitize_stack_entries(other_info->stack_entries, other_info->num_stack_entries, - other_info->ai.ip); + other_info->ai.ip, &other_reordered_to); other_frame = other_info->stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ @@ -434,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change, other_info->ai.cpu_id); /* Print the other thread's stack trace. */ - stack_trace_print(other_info->stack_entries + other_skipnr, + print_stack_trace(other_info->stack_entries + other_skipnr, other_info->num_stack_entries - other_skipnr, - 0); - + other_reordered_to); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) print_verbose_info(other_info->task); @@ -451,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change, get_thread_desc(ai->task_pid), ai->cpu_id); } /* Print stack trace of this thread. */ - stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, - 0); - + print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) print_verbose_info(current); diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index b4295a3892b7..75712959c84e 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -7,10 +7,15 @@ #define pr_fmt(fmt) "kcsan: " fmt +#include <linux/atomic.h> +#include <linux/bitops.h> #include <linux/init.h> +#include <linux/kcsan-checks.h> #include <linux/kernel.h> #include <linux/printk.h> #include <linux/random.h> +#include <linux/sched.h> +#include <linux/spinlock.h> #include <linux/types.h> #include "encoding.h" @@ -103,6 +108,143 @@ static bool __init test_matching_access(void) return true; } +/* + * Correct memory barrier instrumentation is critical to avoiding false + * positives: simple test to check at boot certain barriers are always properly + * instrumented. See kcsan_test for a more complete test. + */ +static DEFINE_SPINLOCK(test_spinlock); +static bool __init test_barrier(void) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + struct kcsan_scoped_access *reorder_access = ¤t->kcsan_ctx.reorder_access; +#else + struct kcsan_scoped_access *reorder_access = NULL; +#endif + bool ret = true; + arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; + atomic_t dummy; + long test_var; + + if (!reorder_access || !IS_ENABLED(CONFIG_SMP)) + return true; + +#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \ + do { \ + reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \ + reorder_access->size = 1; \ + barrier; \ + if (reorder_access->size != 0) { \ + pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \ + ret = false; \ + } \ + } while (0) +#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b) +#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b) +#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b) + + kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */ + + KCSAN_CHECK_READ_BARRIER(mb()); + KCSAN_CHECK_READ_BARRIER(rmb()); + KCSAN_CHECK_READ_BARRIER(smp_mb()); + KCSAN_CHECK_READ_BARRIER(smp_rmb()); + KCSAN_CHECK_READ_BARRIER(dma_rmb()); + KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic()); + KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic()); + KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock()); + KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0)); + KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0)); + KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0)); + KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0)); + KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0)); + KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0)); + KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0)); + KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var)); + arch_spin_lock(&arch_spinlock); + KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock)); + spin_lock(&test_spinlock); + KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock)); + + KCSAN_CHECK_WRITE_BARRIER(mb()); + KCSAN_CHECK_WRITE_BARRIER(wmb()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb()); + KCSAN_CHECK_WRITE_BARRIER(smp_wmb()); + KCSAN_CHECK_WRITE_BARRIER(dma_wmb()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock()); + KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0)); + KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0)); + KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0)); + KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var)); + arch_spin_lock(&arch_spinlock); + KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock)); + spin_lock(&test_spinlock); + KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock)); + + KCSAN_CHECK_RW_BARRIER(mb()); + KCSAN_CHECK_RW_BARRIER(wmb()); + KCSAN_CHECK_RW_BARRIER(rmb()); + KCSAN_CHECK_RW_BARRIER(smp_mb()); + KCSAN_CHECK_RW_BARRIER(smp_wmb()); + KCSAN_CHECK_RW_BARRIER(smp_rmb()); + KCSAN_CHECK_RW_BARRIER(dma_wmb()); + KCSAN_CHECK_RW_BARRIER(dma_rmb()); + KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic()); + KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic()); + KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock()); + KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0)); + KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0)); + KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0)); + KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0)); + KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0)); + KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0)); + KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0)); + KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var)); + arch_spin_lock(&arch_spinlock); + KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); + spin_lock(&test_spinlock); + KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); + +#ifdef clear_bit_unlock_is_negative_byte + KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); +#endif + kcsan_nestable_atomic_end(); + + return ret; +} + static int __init kcsan_selftest(void) { int passed = 0; @@ -120,6 +262,7 @@ static int __init kcsan_selftest(void) RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); + RUN_TEST(test_barrier); pr_info("selftest: %d/%d tests passed\n", passed, total); if (passed != total) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5a5d192a89ac..68480f731192 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -81,7 +81,7 @@ int kexec_should_crash(struct task_struct *p) if (crash_kexec_post_notifiers) return 0; /* - * There are 4 panic() calls in do_exit() path, each of which + * There are 4 panic() calls in make_task_dead() path, each of which * corresponds to each of these 4 conditions. */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e9db0c810554..94cab8c9ce56 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,6 +48,9 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) +#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL) +#define kprobe_sysctls_init() do { } while (0) +#endif static int kprobes_initialized; /* kprobe_table can be accessed by @@ -938,10 +941,10 @@ static void unoptimize_all_kprobes(void) } static DEFINE_MUTEX(kprobe_sysctl_mutex); -int sysctl_kprobes_optimization; -int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) +static int sysctl_kprobes_optimization; +static int proc_kprobes_optimization_handler(struct ctl_table *table, + int write, void *buffer, + size_t *length, loff_t *ppos) { int ret; @@ -957,6 +960,24 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table kprobe_sysctls[] = { + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init kprobe_sysctls_init(void) +{ + register_sysctl_init("debug", kprobe_sysctls); +} #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. */ @@ -2086,6 +2107,9 @@ int register_kretprobe(struct kretprobe *rp) } } + if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) + return -E2BIG; + rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; @@ -2581,6 +2605,7 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); + kprobe_sysctls_init(); return err; } early_initcall(init_kprobes); diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..38c6dd822da8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,6 +52,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int result; int (*threadfn)(void *); void *data; mm_segment_t oldfs; @@ -60,6 +61,8 @@ struct kthread { #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif + /* To store the full name if task comm is truncated. */ + char *full_name; }; enum KTHREAD_BITS { @@ -71,7 +74,7 @@ enum KTHREAD_BITS { static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); - return (__force void *)k->set_child_tid; + return k->worker_private; } /* @@ -79,7 +82,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) * * Per construction; when: * - * (p->flags & PF_KTHREAD) && p->set_child_tid + * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and @@ -87,26 +90,41 @@ static inline struct kthread *to_kthread(struct task_struct *k) */ static inline struct kthread *__to_kthread(struct task_struct *p) { - void *kthread = (__force void *)p->set_child_tid; + void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } -void set_kthread_struct(struct task_struct *p) +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - struct kthread *kthread; + struct kthread *kthread = to_kthread(tsk); - if (__to_kthread(p)) + if (!kthread || !kthread->full_name) { + __get_task_comm(buf, buf_size, tsk); return; + } + + strscpy_pad(buf, kthread->full_name, buf_size); +} + +bool set_kthread_struct(struct task_struct *p) +{ + struct kthread *kthread; + + if (WARN_ON_ONCE(to_kthread(p))) + return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); - /* - * We abuse ->set_child_tid to avoid the new member and because it - * can't be wrongly copied by copy_process(). We also rely on fact - * that the caller can't exec, so PF_KTHREAD can't be cleared. - */ - p->set_child_tid = (__force void __user *)kthread; + if (!kthread) + return false; + + init_completion(&kthread->exited); + init_completion(&kthread->parked); + p->vfork_done = &kthread->exited; + + p->worker_private = kthread; + return true; } void free_kthread_struct(struct task_struct *k) @@ -114,13 +132,17 @@ void free_kthread_struct(struct task_struct *k) struct kthread *kthread; /* - * Can be NULL if this kthread was created by kernel_thread() - * or if kmalloc() in kthread() failed. + * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); + if (!kthread) + return; + #ifdef CONFIG_BLK_CGROUP - WARN_ON_ONCE(kthread && kthread->blkcg_css); + WARN_ON_ONCE(kthread->blkcg_css); #endif + k->worker_private = NULL; + kfree(kthread->full_name); kfree(kthread); } @@ -268,6 +290,44 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); +/** + * kthread_exit - Cause the current kthread return @result to kthread_stop(). + * @result: The integer value to return to kthread_stop(). + * + * While kthread_exit can be called directly, it exists so that + * functions which do some additional work in non-modular code such as + * module_put_and_kthread_exit can be implemented. + * + * Does not return. + */ +void __noreturn kthread_exit(long result) +{ + struct kthread *kthread = to_kthread(current); + kthread->result = result; + do_exit(0); +} + +/** + * kthread_complete_and_exit - Exit the current kthread. + * @comp: Completion to complete + * @code: The integer value to return to kthread_stop(). + * + * If present complete @comp and the reuturn code to kthread_stop(). + * + * A kernel thread whose module may be removed after the completion of + * @comp can use this function exit safely. + * + * Does not return. + */ +void __noreturn kthread_complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + kthread_exit(code); +} +EXPORT_SYMBOL(kthread_complete_and_exit); + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; @@ -279,27 +339,17 @@ static int kthread(void *_create) struct kthread *self; int ret; - set_kthread_struct(current); self = to_kthread(current); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); if (!done) { kfree(create); - do_exit(-EINTR); - } - - if (!self) { - create->result = ERR_PTR(-ENOMEM); - complete(done); - do_exit(-ENOMEM); + kthread_exit(-EINTR); } self->threadfn = threadfn; self->data = data; - init_completion(&self->exited); - init_completion(&self->parked); - current->vfork_done = &self->exited; /* * The new thread inherited kthreadd's priority and CPU mask. Reset @@ -326,7 +376,7 @@ static int kthread(void *_create) __kthread_parkme(self); ret = threadfn(data); } - do_exit(ret); + kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ @@ -406,12 +456,22 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { char name[TASK_COMM_LEN]; + va_list aq; + int len; /* * task is already visible to other tasks, so updating * COMM must be protected. */ - vsnprintf(name, sizeof(name), namefmt, args); + va_copy(aq, args); + len = vsnprintf(name, sizeof(name), namefmt, aq); + va_end(aq); + if (len >= TASK_COMM_LEN) { + struct kthread *kthread = to_kthread(task); + + /* leave it truncated when out of memory. */ + kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + } set_task_comm(task, name); } kfree(create); @@ -523,6 +583,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), to_kthread(p)->cpu = cpu; return p; } +EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { @@ -627,7 +688,7 @@ EXPORT_SYMBOL_GPL(kthread_park); * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * - * If threadfn() may call do_exit() itself, the caller must ensure + * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() @@ -646,7 +707,7 @@ int kthread_stop(struct task_struct *k) kthread_unpark(k); wake_up_process(k); wait_for_completion(&kthread->exited); - ret = k->exit_code; + ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 335d988bd811..585494ec464f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -862,14 +862,11 @@ static void klp_init_object_early(struct klp_patch *patch, list_add_tail(&obj->node, &patch->obj_list); } -static int klp_init_patch_early(struct klp_patch *patch) +static void klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; struct klp_func *func; - if (!patch->objs) - return -EINVAL; - INIT_LIST_HEAD(&patch->list); INIT_LIST_HEAD(&patch->obj_list); kobject_init(&patch->kobj, &klp_ktype_patch); @@ -879,20 +876,12 @@ static int klp_init_patch_early(struct klp_patch *patch) init_completion(&patch->finish); klp_for_each_object_static(patch, obj) { - if (!obj->funcs) - return -EINVAL; - klp_init_object_early(patch, obj); klp_for_each_func_static(obj, func) { klp_init_func_early(obj, func); } } - - if (!try_module_get(patch->mod)) - return -ENODEV; - - return 0; } static int klp_init_patch(struct klp_patch *patch) @@ -1024,10 +1013,17 @@ err: int klp_enable_patch(struct klp_patch *patch) { int ret; + struct klp_object *obj; - if (!patch || !patch->mod) + if (!patch || !patch->mod || !patch->objs) return -EINVAL; + klp_for_each_object_static(patch, obj) { + if (!obj->funcs) + return -EINVAL; + } + + if (!is_livepatch_module(patch->mod)) { pr_err("module %s is not marked as a livepatch module\n", patch->mod->name); @@ -1051,12 +1047,13 @@ int klp_enable_patch(struct klp_patch *patch) return -EINVAL; } - ret = klp_init_patch_early(patch); - if (ret) { + if (!try_module_get(patch->mod)) { mutex_unlock(&klp_mutex); - return ret; + return -ENODEV; } + klp_init_patch_early(patch); + ret = klp_init_patch(patch); if (ret) goto err; diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index e5c9fb295ba9..c2e724d97ddf 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -272,12 +272,12 @@ void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) EXPORT_SYMBOL_GPL(klp_shadow_free); /** - * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * klp_shadow_free_all() - detach and free all <_, id> shadow variables * @id: data identifier * @dtor: custom callback that can be used to unregister the variable * and/or free data that the shadow variable points to (optional) * - * This function releases the memory for all <*, id> shadow variable + * This function releases the memory for all <_, id> shadow variable * instances, callers should stop referencing them accordingly. */ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) @@ -288,7 +288,7 @@ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) spin_lock_irqsave(&klp_shadow_lock, flags); - /* Delete all <*, id> from hash */ + /* Delete all <_, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { if (klp_shadow_match(shadow, shadow->obj, id)) klp_shadow_free_struct(shadow, dtor); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2270ec68f10a..4a882f83aeb9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5485,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -5499,6 +5500,7 @@ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 397ac13d2ef7..9c2fb613a55d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1047,7 +1047,7 @@ static int __init lock_torture_init(void) sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + TOROUT_ERRSTRING("writer_tasks: Out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -1058,7 +1058,7 @@ static int __init lock_torture_init(void) sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + TOROUT_ERRSTRING("reader_tasks: Out of memory"); kfree(writer_tasks); writer_tasks = NULL; firsterr = -ENOMEM; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db1913611192..5e3585950ec8 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -367,8 +367,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, /* * Use vcpu_is_preempted to detect lock holder preemption issue. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (!owner_on_cpu(owner) || need_resched()) { ret = false; break; } @@ -403,14 +402,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * structure won't go away during the spinning period. */ owner = __mutex_owner(lock); - - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted - */ - if (owner) - retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); + retval = owner_on_cpu(owner); /* * If lock->owner is not set, the mutex has been released. Return true diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0c6a48dfcecb..8555c4efe97c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1103,8 +1103,11 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. + * + * Except for ww_mutex, in that case the chain walk must already deal + * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) + if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; raw_spin_lock(&task->pi_lock); @@ -1379,9 +1382,8 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * for CONFIG_PREEMPT_RCU=y) * - the VCPU on which owner runs is preempted */ - if (!owner->on_cpu || need_resched() || - rt_mutex_waiter_is_top_waiter(lock, waiter) || - vcpu_is_preempted(task_cpu(owner))) { + if (!owner_on_cpu(owner) || need_resched() || + !rt_mutex_waiter_is_top_waiter(lock, waiter)) { res = false; break; } diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 5c9299aaabae..900220941caa 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -21,12 +21,13 @@ int max_lock_depth = 1024; */ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, unsigned int state, + struct lockdep_map *nest_lock, unsigned int subclass) { int ret; might_sleep(); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); ret = __rt_mutex_lock(&lock->rtmutex, state); if (ret) mutex_release(&lock->dep_map, _RET_IP_); @@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init); */ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); +} +EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); + #else /* !CONFIG_DEBUG_LOCK_ALLOC */ /** @@ -61,7 +68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif @@ -77,11 +84,26 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { - return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_trylock - try to lock a rt_mutex * * @lock: the rt_mutex to be locked diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..69aba4abe104 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,9 +105,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -334,6 +334,9 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + + /* Writer only, not initialized in reader */ + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -344,12 +347,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -365,6 +362,31 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + */ +static inline void +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -376,6 +398,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); + + oldcount = atomic_long_read(&sem->count); if (list_empty(&sem->wait_list)) { - /* hit end of list above */ + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. - */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; - if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + bool first = rwsem_first_waiter(sem) == waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + if (!first) + return false; + + /* First waiter inherits a previously set handoff bit */ + waiter->handoff_set = true; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * We have either acquired the lock with handoff bit cleared or * set the handoff bit. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + waiter->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. + */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } @@ -613,15 +658,6 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) return false; } -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; @@ -956,7 +992,7 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1002,11 +1038,7 @@ queue: return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } + rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -1020,9 +1052,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1038,16 +1068,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { + if (rwsem_first_waiter(sem) != &waiter) { count = atomic_long_read(&sem->count); /* @@ -1083,13 +1110,16 @@ wait: /* wait until we successfully acquire the lock */ set_current_state(state); for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1098,7 +1128,7 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF) { + if (waiter.handoff_set) { enum owner_state owner_state; preempt_disable(); @@ -1109,66 +1139,26 @@ wait: goto trylock_again; } - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; - - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. - */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } - } + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else + rwsem_del_waiter(sem, &waiter); + if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); - return ERR_PTR(-EINTR); } @@ -1249,17 +1239,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - /* - * Optimize for the case when the rwsem is not locked at all. - */ - tmp = RWSEM_UNLOCKED_VALUE; - do { + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); + } return 0; } diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b562f9289372..7f49baaa4979 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock) __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index b2e553f9255b..48a19ed8486d 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + void __sched rt_read_unlock(rwlock_t *rwlock) { rwlock_release(&rwlock->dep_map, _RET_IP_); @@ -257,12 +269,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_unlock); -int __sched rt_rwlock_is_contended(rwlock_t *rwlock) -{ - return rw_base_is_contended(&rwlock->rwbase); -} -EXPORT_SYMBOL(rt_rwlock_is_contended); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_rwlock_init(rwlock_t *rwlock, const char *name, struct lock_class_key *key) diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index 0e00205cf467..d1473c624105 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -26,7 +26,7 @@ int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) if (__rt_mutex_trylock(&rtm->rtmutex)) { ww_mutex_set_context_fastpath(lock, ww_ctx); - mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); + mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); return 1; } diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 33783abc377b..8c381c99062f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -23,9 +23,28 @@ struct load_info { #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; #endif +#ifdef CONFIG_MODULE_DECOMPRESS + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; }; extern int mod_verify_sig(const void *mod, struct load_info *info); + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif diff --git a/kernel/module.c b/kernel/module.c index 84a9141a5e15..24dab046e16c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -337,12 +337,12 @@ static inline void add_taint_module(struct module *mod, unsigned flag, * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. nfsd and lockd use this. */ -void __noreturn __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) { module_put(mod); - do_exit(code); + kthread_exit(code); } -EXPORT_SYMBOL(__module_put_and_exit); +EXPORT_SYMBOL(__module_put_and_kthread_exit); /* Find a module section: 0 means not found. */ static unsigned int find_sec(const struct load_info *info, const char *name) @@ -958,7 +958,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; @@ -2884,12 +2883,13 @@ static int module_sig_check(struct load_info *info, int flags) const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const char *reason; const void *mod = info->hdr; - + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); /* - * Require flags == 0, as a module with version information - * removed is no longer the module that was signed + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. */ - if (flags == 0 && + if (!mangled_module && info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ @@ -3174,9 +3174,12 @@ out: return err; } -static void free_copy(struct load_info *info) +static void free_copy(struct load_info *info, int flags) { - vfree(info->hdr); + if (flags & MODULE_INIT_COMPRESSED_FILE) + module_decompress_cleanup(info); + else + vfree(info->hdr); } static int rewrite_section_headers(struct load_info *info, int flags) @@ -4125,7 +4128,7 @@ static int load_module(struct load_info *info, const char __user *uargs, } /* Get rid of temporary copy. */ - free_copy(info); + free_copy(info, flags); /* Done! */ trace_module_load(mod); @@ -4174,7 +4177,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_deallocate(mod, info); free_copy: - free_copy(info); + free_copy(info, flags); return err; } @@ -4201,7 +4204,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; - void *hdr = NULL; + void *buf = NULL; + int len; int err; err = may_init_module(); @@ -4211,15 +4215,24 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC)) + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL, + len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, READING_MODULE); - if (err < 0) - return err; - info.hdr = hdr; - info.len = err; + if (len < 0) + return len; + + if (flags & MODULE_INIT_COMPRESSED_FILE) { + err = module_decompress(&info, buf, len); + vfree(buf); /* compressed data is no longer needed */ + if (err) + return err; + } else { + info.hdr = buf; + info.len = len; + } return load_module(&info, uargs, flags); } @@ -4499,6 +4512,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, mod, kallsyms_symbol_value(sym)); if (ret != 0) goto out; + + cond_resched(); } } out: diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c new file mode 100644 index 000000000000..b01c69c2ff99 --- /dev/null +++ b/kernel/module_decompress.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2021 Google LLC. + */ + +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/vmalloc.h> + +#include "module-internal.h" + +static int module_extend_max_pages(struct load_info *info, unsigned int extent) +{ + struct page **new_pages; + + new_pages = kvmalloc_array(info->max_pages + extent, + sizeof(info->pages), GFP_KERNEL); + if (!new_pages) + return -ENOMEM; + + memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages)); + kvfree(info->pages); + info->pages = new_pages; + info->max_pages += extent; + + return 0; +} + +static struct page *module_get_next_page(struct load_info *info) +{ + struct page *page; + int error; + + if (info->max_pages == info->used_pages) { + error = module_extend_max_pages(info, info->used_pages); + if (error) + return ERR_PTR(error); + } + + page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!page) + return ERR_PTR(-ENOMEM); + + info->pages[info->used_pages++] = page; + return page; +} + +#ifdef CONFIG_MODULE_COMPRESS_GZIP +#include <linux/zlib.h> +#define MODULE_COMPRESSION gzip +#define MODULE_DECOMPRESS_FN module_gzip_decompress + +/* + * Calculate length of the header which consists of signature, header + * flags, time stamp and operating system ID (10 bytes total), plus + * an optional filename. + */ +static size_t module_gzip_header_len(const u8 *buf, size_t size) +{ + const u8 signature[] = { 0x1f, 0x8b, 0x08 }; + size_t len = 10; + + if (size < len || memcmp(buf, signature, sizeof(signature))) + return 0; + + if (buf[3] & 0x08) { + do { + /* + * If we can't find the end of the file name we must + * be dealing with a corrupted file. + */ + if (len == size) + return 0; + } while (buf[len++] != '\0'); + } + + return len; +} + +static ssize_t module_gzip_decompress(struct load_info *info, + const void *buf, size_t size) +{ + struct z_stream_s s = { 0 }; + size_t new_size = 0; + size_t gzip_hdr_len; + ssize_t retval; + int rc; + + gzip_hdr_len = module_gzip_header_len(buf, size); + if (!gzip_hdr_len) { + pr_err("not a gzip compressed module\n"); + return -EINVAL; + } + + s.next_in = buf + gzip_hdr_len; + s.avail_in = size - gzip_hdr_len; + + s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (!s.workspace) + return -ENOMEM; + + rc = zlib_inflateInit2(&s, -MAX_WBITS); + if (rc != Z_OK) { + pr_err("failed to initialize decompressor: %d\n", rc); + retval = -EINVAL; + goto out; + } + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out_inflate_end; + } + + s.next_out = kmap(page); + s.avail_out = PAGE_SIZE; + rc = zlib_inflate(&s, 0); + kunmap(page); + + new_size += PAGE_SIZE - s.avail_out; + } while (rc == Z_OK); + + if (rc != Z_STREAM_END) { + pr_err("decompression failed with status %d\n", rc); + retval = -EINVAL; + goto out_inflate_end; + } + + retval = new_size; + +out_inflate_end: + zlib_inflateEnd(&s); +out: + kfree(s.workspace); + return retval; +} +#elif CONFIG_MODULE_COMPRESS_XZ +#include <linux/xz.h> +#define MODULE_COMPRESSION xz +#define MODULE_DECOMPRESS_FN module_xz_decompress + +static ssize_t module_xz_decompress(struct load_info *info, + const void *buf, size_t size) +{ + static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 }; + struct xz_dec *xz_dec; + struct xz_buf xz_buf; + enum xz_ret xz_ret; + size_t new_size = 0; + ssize_t retval; + + if (size < sizeof(signature) || + memcmp(buf, signature, sizeof(signature))) { + pr_err("not an xz compressed module\n"); + return -EINVAL; + } + + xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1); + if (!xz_dec) + return -ENOMEM; + + xz_buf.in_size = size; + xz_buf.in = buf; + xz_buf.in_pos = 0; + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out; + } + + xz_buf.out = kmap(page); + xz_buf.out_pos = 0; + xz_buf.out_size = PAGE_SIZE; + xz_ret = xz_dec_run(xz_dec, &xz_buf); + kunmap(page); + + new_size += xz_buf.out_pos; + } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); + + if (xz_ret != XZ_STREAM_END) { + pr_err("decompression failed with status %d\n", xz_ret); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + xz_dec_end(xz_dec); + return retval; +} +#else +#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" +#endif + +int module_decompress(struct load_info *info, const void *buf, size_t size) +{ + unsigned int n_pages; + ssize_t data_size; + int error; + + /* + * Start with number of pages twice as big as needed for + * compressed data. + */ + n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2; + error = module_extend_max_pages(info, n_pages); + + data_size = MODULE_DECOMPRESS_FN(info, buf, size); + if (data_size < 0) { + error = data_size; + goto err; + } + + info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL); + if (!info->hdr) { + error = -ENOMEM; + goto err; + } + + info->len = data_size; + return 0; + +err: + module_decompress_cleanup(info); + return error; +} + +void module_decompress_cleanup(struct load_info *info) +{ + int i; + + if (info->hdr) + vunmap(info->hdr); + + for (i = 0; i < info->used_pages; i++) + __free_page(info->pages[i]); + + kvfree(info->pages); + + info->pages = NULL; + info->max_pages = info->used_pages = 0; +} + +static ssize_t compression_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); +} +static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); + +static int __init module_decompress_sysfs_init(void) +{ + int error; + + error = sysfs_create_file(&module_kset->kobj, + &module_compression_attr.attr); + if (error) + pr_warn("Failed to create 'compression' attribute"); + + return 0; +} +late_initcall(module_decompress_sysfs_init); diff --git a/kernel/notifier.c b/kernel/notifier.c index b8251dc0bc0f..ba005ebf4730 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -20,12 +20,13 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); */ static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) + struct notifier_block *n) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { - WARN(1, "double register detected"); - return 0; + WARN(1, "notifier callback %ps already registered", + n->notifier_call); + return -EEXIST; } if (n->priority > (*nl)->priority) break; @@ -134,7 +135,7 @@ static int notifier_call_chain_robust(struct notifier_block **nl, * * Adds a notifier to an atomic notifier chain. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) @@ -216,7 +217,7 @@ NOKPROBE_SYMBOL(atomic_notifier_call_chain); * Adds a notifier to a blocking notifier chain. * Must be called in process context. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) @@ -335,7 +336,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) @@ -406,7 +407,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain); * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) diff --git a/kernel/panic.c b/kernel/panic.c index cefd7d82366f..55b50e052ec3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ #include <linux/bug.h> #include <linux/ratelimit.h> #include <linux/debugfs.h> +#include <trace/events/error_report.h> #include <asm/sections.h> #define PANIC_TIMER_STEP 100 @@ -533,26 +534,9 @@ void oops_enter(void) trigger_all_cpu_backtrace(); } -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else - oops_id++; - - return 0; -} -late_initcall(init_oops_id); - static void print_oops_end_marker(void) { - init_oops_id(); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* @@ -609,6 +593,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, print_irqtrace_events(current); print_oops_end_marker(); + trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller); /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); diff --git a/kernel/params.c b/kernel/params.c index 8299bd764e42..5b92310425c5 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -926,9 +926,9 @@ static const struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) +static int uevent_filter(struct kobject *kobj) { - struct kobj_type *ktype = get_ktype(kobj); + const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &module_ktype) return 1; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9ed9b744876c..e6af502c2fd7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,7 +693,7 @@ static int load_image_and_restore(void) goto Unlock; error = swsusp_read(&flags); - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -983,7 +983,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; } @@ -1018,7 +1018,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Finish; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 326f8d032eb5..b4f433943209 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -170,6 +170,7 @@ extern int swsusp_swap_in_use(void); #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 #define SF_CRC32_MODE 4 +#define SF_HW_SIG 8 /* kernel/power/hibernate.c */ extern int swsusp_check(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f7a986078213..330d49937692 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -978,8 +978,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm) * Register a range of page frames the contents of which should not be saved * during hibernation (to be used in the early initialization code). */ -void __init __register_nosave_region(unsigned long start_pfn, - unsigned long end_pfn, int use_kmalloc) +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) { struct nosave_region *region; @@ -995,18 +994,12 @@ void __init __register_nosave_region(unsigned long start_pfn, goto Report; } } - if (use_kmalloc) { - /* During init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else { - /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), - SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); - } + /* This allocation cannot fail */ + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ff326c2cb77b..ad10359030a4 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -36,6 +36,8 @@ #define HIBERNATE_SIG "S1SUSPEND" +u32 swsusp_hardware_signature; + /* * When reading an {un,}compressed image, we may restore pages in place, * in which case some architectures need these pages cleaning before they @@ -104,7 +106,8 @@ struct swap_map_handle { struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - - sizeof(u32)]; + sizeof(u32) - sizeof(u32)]; + u32 hw_sig; u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ @@ -312,7 +315,6 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ - static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -324,6 +326,10 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; + if (swsusp_hardware_signature) { + swsusp_header->hw_sig = swsusp_hardware_signature; + flags |= SF_HW_SIG; + } swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; @@ -1537,6 +1543,12 @@ int swsusp_check(void) } else { error = -EINVAL; } + if (!error && swsusp_header->flags & SF_HW_SIG && + swsusp_header->hw_sig != swsusp_hardware_signature) { + pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n", + swsusp_header->hw_sig, swsusp_hardware_signature); + error = -EINVAL; + } put: if (error) diff --git a/kernel/power/user.c b/kernel/power/user.c index 740723bb3885..ad241b4ff64c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -177,7 +177,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res <= 0) goto unlock; } else { - res = PAGE_SIZE - pg_offp; + res = PAGE_SIZE; } if (!data_of(data->handle)) { diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 105df4dfc783..52571dcad768 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws->active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - str += scnprintf(str, end - str, "\n"); + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index d118739874c0..f5b388e810b9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,5 +2,8 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o -obj-$(CONFIG_PRINTK) += printk_ringbuffer.o obj-$(CONFIG_PRINTK_INDEX) += index.o + +obj-$(CONFIG_PRINTK) += printk_support.o +printk_support-y := printk_ringbuffer.o +printk_support-$(CONFIG_SYSCTL) += sysctl.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 9f3ed2fdb721..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -4,6 +4,14 @@ */ #include <linux/percpu.h> +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +void __init printk_sysctl_init(void); +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#else +#define printk_sysctl_init() do { } while (0) +#endif + #ifdef CONFIG_PRINTK /* Flags for a single printk record. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 57b132b658e1..82abfaf3c2aa 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -171,7 +171,7 @@ static int __init control_devkmsg(char *str) __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; - +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -210,6 +210,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } +#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; @@ -280,7 +281,6 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; -static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2861,7 +2861,8 @@ early_param("keep_bootcon", keep_bootcon_setup); * Care need to be taken with consoles that are statically * enabled such as netconsole */ -static int try_enable_new_console(struct console *newcon, bool user_specified) +static int try_enable_preferred_console(struct console *newcon, + bool user_specified) { struct console_cmdline *c; int i, err; @@ -2891,10 +2892,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return err; } newcon->flags |= CON_ENABLED; - if (i == preferred_console) { + if (i == preferred_console) newcon->flags |= CON_CONSDEV; - has_preferred_console = true; - } return 0; } @@ -2909,6 +2908,21 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return -ENOENT; } +/* Try to enable the console unconditionally */ +static void try_enable_default_console(struct console *newcon) +{ + if (newcon->index < 0) + newcon->index = 0; + + if (newcon->setup && newcon->setup(newcon, NULL) != 0) + return; + + newcon->flags |= CON_ENABLED; + + if (newcon->device) + newcon->flags |= CON_CONSDEV; +} + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -2930,59 +2944,56 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) */ void register_console(struct console *newcon) { - struct console *bcon = NULL; + struct console *con; + bool bootcon_enabled = false; + bool realcon_enabled = false; int err; - for_each_console(bcon) { - if (WARN(bcon == newcon, "console '%s%d' already registered\n", - bcon->name, bcon->index)) + for_each_console(con) { + if (WARN(con == newcon, "console '%s%d' already registered\n", + con->name, con->index)) return; } - /* - * before we register a new CON_BOOT console, make sure we don't - * already have a valid console - */ - if (newcon->flags & CON_BOOT) { - for_each_console(bcon) { - if (!(bcon->flags & CON_BOOT)) { - pr_info("Too late to register bootconsole %s%d\n", - newcon->name, newcon->index); - return; - } - } + for_each_console(con) { + if (con->flags & CON_BOOT) + bootcon_enabled = true; + else + realcon_enabled = true; } - if (console_drivers && console_drivers->flags & CON_BOOT) - bcon = console_drivers; - - if (!has_preferred_console || bcon || !console_drivers) - has_preferred_console = preferred_console >= 0; + /* Do not register boot consoles when there already is a real one. */ + if (newcon->flags & CON_BOOT && realcon_enabled) { + pr_info("Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } /* - * See if we want to use this console driver. If we - * didn't select a console we take the first one - * that registers here. + * See if we want to enable this console driver by default. + * + * Nope when a console is preferred by the command line, device + * tree, or SPCR. + * + * The first real console with tty binding (driver) wins. More + * consoles might get enabled before the right one is found. + * + * Note that a console with tty binding will have CON_CONSDEV + * flag set and will be first in the list. */ - if (!has_preferred_console) { - if (newcon->index < 0) - newcon->index = 0; - if (newcon->setup == NULL || - newcon->setup(newcon, NULL) == 0) { - newcon->flags |= CON_ENABLED; - if (newcon->device) { - newcon->flags |= CON_CONSDEV; - has_preferred_console = true; - } + if (preferred_console < 0) { + if (!console_drivers || !console_drivers->device || + console_drivers->flags & CON_BOOT) { + try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ - err = try_enable_new_console(newcon, true); + err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) - err = try_enable_new_console(newcon, false); + err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) @@ -2994,8 +3005,10 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + if (bootcon_enabled && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; + } /* * Put this console in the list - keep the @@ -3051,15 +3064,15 @@ void register_console(struct console *newcon) pr_info("%sconsole [%s%d] enabled\n", (newcon->flags & CON_BOOT) ? "boot" : "" , newcon->name, newcon->index); - if (bcon && + if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { /* We need to iterate through all boot consoles, to make * sure we print everything out, before we unregister them. */ - for_each_console(bcon) - if (bcon->flags & CON_BOOT) - unregister_console(bcon); + for_each_console(con) + if (con->flags & CON_BOOT) + unregister_console(con); } } EXPORT_SYMBOL(register_console); @@ -3199,6 +3212,7 @@ static int __init printk_late_init(void) ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); + printk_sysctl_init(); return 0; } late_initcall(printk_late_init); diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c new file mode 100644 index 000000000000..653ae04aab7f --- /dev/null +++ b/kernel/printk/sysctl.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sysctl.c: General linux system control interface + */ + +#include <linux/sysctl.h> +#include <linux/printk.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include "internal.h" + +static const int ten_thousand = 10000; + +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +static struct ctl_table printk_sysctls[] = { + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + {} +}; + +void __init printk_sysctl_init(void) +{ + register_sysctl_init("kernel", printk_sysctls); +} diff --git a/kernel/profile.c b/kernel/profile.c index eb9c7f0f5ac5..37640a0bd8a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -133,79 +133,6 @@ int __ref profile_init(void) return -ENOMEM; } -/* Profile event notifications */ - -static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); -static ATOMIC_NOTIFIER_HEAD(task_free_notifier); -static BLOCKING_NOTIFIER_HEAD(munmap_notifier); - -void profile_task_exit(struct task_struct *task) -{ - blocking_notifier_call_chain(&task_exit_notifier, 0, task); -} - -int profile_handoff_task(struct task_struct *task) -{ - int ret; - ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); - return (ret == NOTIFY_OK) ? 1 : 0; -} - -void profile_munmap(unsigned long addr) -{ - blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); -} - -int task_handoff_register(struct notifier_block *n) -{ - return atomic_notifier_chain_register(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_register); - -int task_handoff_unregister(struct notifier_block *n) -{ - return atomic_notifier_chain_unregister(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_unregister); - -int profile_event_register(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_register( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_register( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_register); - -int profile_event_unregister(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_unregister( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_unregister( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_unregister); - #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f8589bf8d7dc..eea265082e97 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -419,8 +419,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - if (seize) - flags |= PT_SEIZED; task->ptrace = flags; ptrace_link(task, current); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3128b7cf8e1f..bf8e341e75b4 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -112,7 +112,7 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" @@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF Take the default if unsure. -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - config RCU_BOOST bool "Enable RCU priority boosting" depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index aaa111237b60..81145c3ece25 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,16 +261,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded. + * Mark the specified rcu_segcblist structure as offloaded (or not) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - if (offload) { - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); - rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); - } else { + if (offload) + rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); + else rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); - } } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 9a19328ff251..e373fbe44da5 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -80,11 +80,14 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } -/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */ +/* + * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the + * [de]offloading process)? + */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) return true; return false; @@ -92,9 +95,8 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) { - int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; - - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) return true; return false; diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 228f143bf935..5e4f1f83d38e 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -50,8 +50,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) #define VERBOSE_SCALEOUT_STRING(s) \ do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) -#define VERBOSE_SCALEOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) +#define SCALEOUT_ERRSTRING(s) \ + pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s) /* * The intended use cases for the nreaders and nwriters module parameters @@ -514,11 +514,11 @@ rcu_scale_cleanup(void) * during the mid-boot phase, so have to wait till the end. */ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) - VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); if (gp_exp && gp_async) - VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; @@ -845,7 +845,7 @@ rcu_scale_init(void) reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -865,7 +865,7 @@ rcu_scale_init(void) kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b410d982990..422f7e4cc08d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -46,6 +46,7 @@ #include <linux/oom.h> #include <linux/tick.h> #include <linux/rcupdate_trace.h> +#include <linux/nmi.h> #include "rcu.h" @@ -53,15 +54,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); /* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ #define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ #define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) @@ -75,7 +79,7 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); @@ -109,6 +113,8 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, + "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); torture_param(int, stall_gp_kthread, 0, @@ -140,7 +146,7 @@ static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; -static struct task_struct *fwd_prog_task; +static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; @@ -342,10 +348,12 @@ struct rcu_torture_ops { void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + long cbflood_max; int irq_capable; int can_boost; int extendables; int slow_gps; + int no_pi_lock; const char *name; }; @@ -667,6 +675,7 @@ static struct rcu_torture_ops srcu_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" }; @@ -700,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" }; @@ -720,6 +730,7 @@ static struct rcu_torture_ops busted_srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted_srcud" }; @@ -831,6 +842,7 @@ static struct rcu_torture_ops tasks_rude_ops = { .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -871,6 +883,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -1420,13 +1433,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct rt_read_seg *rtrsp) { unsigned long flags; - int idxnew = -1; - int idxold = *readstate; + int idxnew1 = -1; + int idxnew2 = -1; + int idxold1 = *readstate; + int idxold2 = idxold1; int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; - WARN_ON_ONCE(idxold < 0); - WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + WARN_ON_ONCE(idxold2 < 0); + WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1440,8 +1455,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, preempt_disable(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); - if (statesnew & RCUTORTURE_RDR_RCU) - idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + if (statesnew & RCUTORTURE_RDR_RCU_1) + idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + if (statesnew & RCUTORTURE_RDR_RCU_2) + idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; /* * Next, remove old protection, in decreasing order of strength @@ -1460,12 +1477,20 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_bh_enable(); if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); - if (statesold & RCUTORTURE_RDR_RCU) { - bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + if (statesold & RCUTORTURE_RDR_RCU_2) { + cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + WARN_ON_ONCE(idxnew2 != -1); + idxold2 = 0; + } + if (statesold & RCUTORTURE_RDR_RCU_1) { + bool lockit; + lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } @@ -1475,13 +1500,19 @@ static void rcutorture_one_extend(int *readstate, int newstate, cur_ops->read_delay(trsp, rtrsp); /* Update the reader state. */ - if (idxnew == -1) - idxnew = idxold & ~RCUTORTURE_RDR_MASK; - WARN_ON_ONCE(idxnew < 0); - WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); - *readstate = idxnew | newstate; - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); + if (idxnew1 == -1) + idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; + WARN_ON_ONCE(idxnew1 < 0); + if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) + pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); + if (idxnew2 == -1) + idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; + WARN_ON_ONCE(idxnew2 < 0); + WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + *readstate = idxnew1 | idxnew2 | newstate; + WARN_ON_ONCE(*readstate < 0); + if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) + pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1491,7 +1522,7 @@ static int rcutorture_extend_mask_max(void) WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; - mask = mask | RCUTORTURE_RDR_RCU; + mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; return mask; } @@ -1506,13 +1537,21 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); + // Can't have nested RCU reader without outer RCU reader. + if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) { + if (oldmask & RCUTORTURE_RDR_RCU_1) + mask &= ~RCUTORTURE_RDR_RCU_2; + else + mask |= RCUTORTURE_RDR_RCU_1; + } + /* * Can't enable bh w/irq disabled. */ @@ -1532,7 +1571,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) mask |= oldmask & bhs; } - return mask ?: RCUTORTURE_RDR_RCU; + return mask ?: RCUTORTURE_RDR_RCU_1; } /* @@ -1626,7 +1665,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); @@ -1992,9 +2031,8 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@ -2003,8 +2041,6 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } @@ -2052,6 +2088,8 @@ static int rcu_torture_stall(void *args) #else schedule_timeout_uninterruptible(HZ); #endif + } else if (stall_no_softlockup) { + touch_softlockup_watchdog(); } if (stall_cpu_irqsoff) local_irq_enable(); @@ -2123,10 +2161,13 @@ struct rcu_fwd { unsigned long rcu_fwd_startat; struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; unsigned long rcu_launder_gp_seq_start; + int rcu_fwd_id; }; static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; +static unsigned long rcu_fwd_seq; +static atomic_long_t rcu_fwd_max_cbs; static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2139,8 +2180,9 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rfp->rcu_fwd_startat); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. + pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", + __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { gps = rfp->n_launders_hist[j].launder_gp_seq; @@ -2151,6 +2193,7 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) gps_old = gps; } pr_cont("\n"); + mutex_unlock(&rcu_fwd_mutex); } /* Callback function for continuous-flood RCU callbacks. */ @@ -2276,7 +2319,8 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__, + rfp->rcu_fwd_id, dur, cver, gps); } if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); @@ -2344,7 +2388,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; - } else { + } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) { rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); if (WARN_ON_ONCE(!rfcp)) { schedule_timeout_interruptible(1); @@ -2354,8 +2398,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders_sa = 0; rfcp->rfc_gps = 0; rfcp->rfc_rfp = rfp; + } else { + rfcp = NULL; } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + if (rfcp) + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); @@ -2379,6 +2426,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); + atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ @@ -2394,6 +2442,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + int i; + long ncbs; struct rcu_fwd *rfp; mutex_lock(&rcu_fwd_mutex); @@ -2404,18 +2454,26 @@ static int rcutorture_oom_notify(struct notifier_block *self, } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(rfp); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); + for (i = 0; i < fwd_progress; i++) { + rcu_torture_fwd_cb_hist(&rfp[i]); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2); + } WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@ -2430,7 +2488,10 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + bool firsttime = true; + long max_cbs; int oldnice = task_nice(current); + unsigned long oldseq = READ_ONCE(rcu_fwd_seq); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -2440,21 +2501,38 @@ static int rcu_torture_fwd_prog(void *args) if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { - schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - WRITE_ONCE(rcu_fwd_emergency_stop, false); - if (!IS_ENABLED(CONFIG_TINY_RCU) || - rcu_inkernel_boot_has_ended()) - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - if (rcu_inkernel_boot_has_ended()) + if (!rfp->rcu_fwd_id) { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!firsttime) { + max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0); + pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs); + } + firsttime = false; + WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); + } else { + while (READ_ONCE(rcu_fwd_seq) == oldseq) + schedule_timeout_interruptible(1); + oldseq = READ_ONCE(rcu_fwd_seq); + } + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) rcu_torture_fwd_prog_cr(rfp); + if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) && + (!IS_ENABLED(CONFIG_TINY_RCU) || + (rcu_inkernel_boot_has_ended() && + torture_num_online_cpus() > rfp->rcu_fwd_id))) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); /* Avoid slow periods, better to test when busy. */ if (stutter_wait("rcu_torture_fwd_prog")) sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ - WARN_ON(!tested && tested_tries >= 5); - pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + if (!rfp->rcu_fwd_id) { + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + } torture_kthread_stopping("rcu_torture_fwd_prog"); return 0; } @@ -2462,17 +2540,28 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + int i; + int ret = 0; struct rcu_fwd *rfp; if (!fwd_progress) return 0; /* Not requested, so don't do it. */ + if (fwd_progress >= nr_cpu_ids) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n"); + fwd_progress = nr_cpu_ids; + } else if (fwd_progress < 0) { + fwd_progress = nr_cpu_ids; + } if ((!cur_ops->sync && !cur_ops->call) || - !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { + (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + fwd_progress = 0; return 0; } if (stall_cpu > 0) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ WARN_ON(1); /* Make sure rcutorture notices conflict. */ @@ -2482,29 +2571,51 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); - if (!rfp) + rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL); + fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL); + if (!rfp || !fwd_prog_tasks) { + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; + fwd_progress = 0; return -ENOMEM; - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + } + for (i = 0; i < fwd_progress; i++) { + spin_lock_init(&rfp[i].rcu_fwd_lock); + rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head; + rfp[i].rcu_fwd_id = i; + } mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); register_oom_notifier(&rcutorture_oom_nb); - return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); + for (i = 0; i < fwd_progress; i++) { + ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]); + if (ret) { + fwd_progress = i; + return ret; + } + } + return 0; } static void rcu_torture_fwd_prog_cleanup(void) { + int i; struct rcu_fwd *rfp; - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); - rfp = rcu_fwds; + if (!rcu_fwds || !fwd_prog_tasks) + return; + for (i = 0; i < fwd_progress; i++) + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]); + unregister_oom_notifier(&rcutorture_oom_nb); mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); - unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; } /* Callback function for RCU barrier testing. */ @@ -2741,7 +2852,7 @@ static int rcu_torture_read_exit(void *unused) &trs, "%s", "rcu_torture_read_exit_child"); if (IS_ERR(tsp)) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); errexit = true; tsp = NULL; break; @@ -3068,7 +3179,7 @@ rcu_torture_init(void) sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -3084,7 +3195,7 @@ rcu_torture_init(void) rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), GFP_KERNEL); if (!reader_tasks || !rcu_torture_reader_mbchk) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -3103,7 +3214,7 @@ rcu_torture_init(void) if (nrealnocbers > 0) { nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); if (nocb_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1631ef8a138d..5489ff7f478e 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -44,7 +44,10 @@ pr_alert("%s" SCALE_FLAG s, scale_type, ## x) #define VERBOSE_SCALEOUT(s, x...) \ - do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + do { \ + if (verbose) \ + pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \ + } while (0) static atomic_t verbose_batch_ctr; @@ -54,12 +57,11 @@ do { \ (verbose_batched <= 0 || \ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \ schedule_timeout_uninterruptible(1); \ - pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ + pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \ } \ } while (0) -#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) +#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); @@ -604,7 +606,7 @@ static u64 process_durations(int n) char *buf; u64 sum = 0; - buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; buf[0] = 0; @@ -617,13 +619,15 @@ static u64 process_durations(int n) if (i % 5 == 0) strcat(buf, "\n"); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } strcat(buf, buf1); sum += rt->last_duration_ns; } - strcat(buf, "\n"); - - SCALEOUT("%s\n", buf); + pr_alert("%s\n", buf); kfree(buf); return sum; @@ -637,7 +641,6 @@ static u64 process_durations(int n) // point all the timestamps are printed. static int main_func(void *arg) { - bool errexit = false; int exp, r; char buf1[64]; char *buf; @@ -648,10 +651,10 @@ static int main_func(void *arg) VERBOSE_SCALEOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); - errexit = true; + SCALEOUT_ERRSTRING("out of memory"); + goto oom_exit; } if (holdoff) schedule_timeout_interruptible(holdoff * HZ); @@ -663,8 +666,6 @@ static int main_func(void *arg) // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (errexit) - break; if (torture_must_stop()) goto end; @@ -698,26 +699,23 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - if (!errexit) { - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - } - + pr_alert("Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { u64 avg; u32 rem; - if (errexit) - break; avg = div_u64_rem(result_avg[exp], 1000, &rem); sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } } - if (!errexit) - SCALEOUT("%s", buf); + pr_alert("%s", buf); +oom_exit: // This will shutdown everything including us. if (shutdown) { shutdown_start = 1; @@ -841,12 +839,12 @@ ref_scale_init(void) reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } - VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); for (i = 0; i < nreaders; i++) { firsterr = torture_create_kthread(ref_scale_reader, (void *)i, diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index a0ba2ed49bc6..92c002d65482 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -99,7 +99,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7da3c81c3f59..d64f0b1d8cd3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -6,6 +6,7 @@ */ #ifdef CONFIG_TASKS_RCU_GENERIC +#include "rcu_segcblist.h" //////////////////////////////////////////////////////////////////////// // @@ -20,11 +21,33 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** + * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. + * @cblist: Callback list. + * @lock: Lock protecting per-CPU callback list. + * @rtp_jiffies: Jiffies counter value for statistics. + * @rtp_n_lock_retries: Rough lock-contention statistic. + * @rtp_work: Work queue for invoking callbacks. + * @rtp_irq_work: IRQ work queue for deferred wakeups. + * @barrier_q_head: RCU callback for barrier operation. + * @cpu: CPU number corresponding to this entry. + * @rtpp: Pointer to the rcu_tasks structure. + */ +struct rcu_tasks_percpu { + struct rcu_segcblist cblist; + raw_spinlock_t __private lock; + unsigned long rtp_jiffies; + unsigned long rtp_n_lock_retries; + struct work_struct rtp_work; + struct irq_work rtp_irq_work; + struct rcu_head barrier_q_head; + int cpu; + struct rcu_tasks *rtpp; +}; + +/** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_head: Head of callback list. - * @cbs_tail: Tail pointer for callback list. * @cbs_wq: Wait queue allowing new callback to get kthread's attention. - * @cbs_lock: Lock protecting callback list. + * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). @@ -32,7 +55,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @n_gps: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @pregp_func: This flavor's pre-grace-period function (optional). @@ -41,20 +64,27 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. + * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. + * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. + * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. + * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers. + * @barrier_q_mutex: Serialize barrier operations. + * @barrier_q_count: Number of queues being waited on. + * @barrier_q_completion: Barrier wait/wakeup mechanism. + * @barrier_q_seq: Sequence number for barrier operations. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct rcu_head *cbs_head; - struct rcu_head **cbs_tail; struct wait_queue_head cbs_wq; - raw_spinlock_t cbs_lock; + raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; int init_fract; unsigned long gp_jiffies; unsigned long gp_start; - unsigned long n_gps; + unsigned long tasks_gp_seq; unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; @@ -65,20 +95,40 @@ struct rcu_tasks { holdouts_func_t holdouts_func; postgp_func_t postgp_func; call_rcu_func_t call_func; + struct rcu_tasks_percpu __percpu *rtpcpu; + int percpu_enqueue_shift; + int percpu_enqueue_lim; + int percpu_dequeue_lim; + unsigned long percpu_dequeue_gpseq; + struct mutex barrier_q_mutex; + atomic_t barrier_q_count; + struct completion barrier_q_completion; + unsigned long barrier_q_seq; char *name; char *kname; }; -#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ -static struct rcu_tasks rt_name = \ -{ \ - .cbs_tail = &rt_name.cbs_head, \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ - .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ - .gp_func = gp, \ - .call_func = call, \ - .name = n, \ - .kname = #rt_name, \ +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ + .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ +}; \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .rtpcpu = &rt_name ## __percpu, \ + .name = n, \ + .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1, \ + .percpu_enqueue_lim = 1, \ + .percpu_dequeue_lim = 1, \ + .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ + .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \ + .kname = #rt_name, \ } /* Track exiting tasks in order to allow them to be waited for. */ @@ -94,6 +144,15 @@ module_param(rcu_task_ipi_delay, int, 0644); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +static int rcu_task_enqueue_lim __read_mostly = -1; +module_param(rcu_task_enqueue_lim, int, 0444); + +static bool rcu_task_cb_adjust; +static int rcu_task_contend_lim __read_mostly = 100; +module_param(rcu_task_contend_lim, int, 0444); +static int rcu_task_collapse_lim __read_mostly = 10; +module_param(rcu_task_collapse_lim, int, 0444); + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -128,6 +187,8 @@ static const char * const rcu_tasks_gp_state_names[] = { // // Generic code. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp); + /* Record grace-period phase and time. */ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) { @@ -148,23 +209,110 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) } #endif /* #ifndef CONFIG_TINY_RCU */ +// Initialize per-CPU callback lists for the specified flavor of +// Tasks RCU. +static void cblist_init_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + int lim; + int shift; + + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rcu_task_enqueue_lim < 0) { + rcu_task_enqueue_lim = 1; + rcu_task_cb_adjust = true; + pr_info("%s: Setting adjustable number of callback queues.\n", __func__); + } else if (rcu_task_enqueue_lim == 0) { + rcu_task_enqueue_lim = 1; + } + lim = rcu_task_enqueue_lim; + + if (lim > nr_cpu_ids) + lim = nr_cpu_ids; + shift = ilog2(nr_cpu_ids / lim); + if (((nr_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(!rtpcp); + if (cpu) + raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + if (rcu_segcblist_empty(&rtpcp->cblist)) + rcu_segcblist_init(&rtpcp->cblist); + INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); + rtpcp->cpu = cpu; + rtpcp->rtpp = rtp; + raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); +} + +// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); + + rtp = rtpcp->rtpp; + wake_up(&rtp->cbs_wq); +} + // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { unsigned long flags; + unsigned long j; + bool needadjust = false; bool needwake; + struct rcu_tasks_percpu *rtpcp; rhp->next = NULL; rhp->func = func; - raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - needwake = !rtp->cbs_head; - WRITE_ONCE(*rtp->cbs_tail, rhp); - rtp->cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + local_irq_save(flags); + rcu_read_lock(); + rtpcp = per_cpu_ptr(rtp->rtpcpu, + smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + j = jiffies; + if (rtpcp->rtp_jiffies != j) { + rtpcp->rtp_jiffies = j; + rtpcp->rtp_n_lock_retries = 0; + } + if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && + READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + needadjust = true; // Defer adjustment to avoid deadlock. + } + if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { + raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. + cblist_init_generic(rtp); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + } + needwake = rcu_segcblist_empty(&rtpcp->cblist); + rcu_segcblist_enqueue(&rtpcp->cblist, rhp); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (unlikely(needadjust)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); + WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + rcu_read_unlock(); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) - wake_up(&rtp->cbs_wq); + irq_work_queue(&rtpcp->rtp_irq_work); } // Wait for a grace period for the specified flavor of Tasks RCU. @@ -178,12 +326,173 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) wait_rcu_gp(rtp->call_func); } +// RCU callback function for rcu_barrier_tasks_generic(). +static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp; + + rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); + rtp = rtpcp->rtpp; + if (atomic_dec_and_test(&rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); +} + +// Wait for all in-flight callbacks for the specified RCU Tasks flavor. +// Operates in a manner similar to rcu_barrier(). +static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq); + + mutex_lock(&rtp->barrier_q_mutex); + if (rcu_seq_done(&rtp->barrier_q_seq, s)) { + smp_mb(); + mutex_unlock(&rtp->barrier_q_mutex); + return; + } + rcu_seq_start(&rtp->barrier_q_seq); + init_completion(&rtp->barrier_q_completion); + atomic_set(&rtp->barrier_q_count, 2); + for_each_possible_cpu(cpu) { + if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim)) + break; + rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head)) + atomic_inc(&rtp->barrier_q_count); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + if (atomic_sub_and_test(2, &rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); + wait_for_completion(&rtp->barrier_q_completion); + rcu_seq_end(&rtp->barrier_q_seq); + mutex_unlock(&rtp->barrier_q_mutex); +} + +// Advance callbacks and indicate whether either a grace period or +// callback invocation is needed. +static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + long n; + long ncbs = 0; + long ncbsnz = 0; + int needgpcb = 0; + + for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + /* Advance and accelerate any new callbacks. */ + if (!rcu_segcblist_n_cbs(&rtpcp->cblist)) + continue; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + // Should we shrink down to a single callback queue? + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (n) { + ncbs += n; + if (cpu > 0) + ncbsnz += n; + } + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + needgpcb |= 0x3; + if (!rcu_segcblist_empty(&rtpcp->cblist)) + needgpcb |= 0x1; + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + + // Shrink down to a single callback queue if appropriate. + // This is done in two stages: (1) If there are no more than + // rcu_task_collapse_lim callbacks on CPU 0 and none on any other + // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period, + // if there has not been an increase in callbacks, limit dequeuing + // to CPU 0. Note the matching RCU read-side critical section in + // call_rcu_tasks_generic(). + if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim > 1) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); + smp_store_release(&rtp->percpu_enqueue_lim, 1); + rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + if (rcu_task_cb_adjust && !ncbsnz && + poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { + WRITE_ONCE(rtp->percpu_dequeue_lim, 1); + pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + + return needgpcb; +} + +// Advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) +{ + int cpu; + int cpunext; + unsigned long flags; + int len; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + struct rcu_tasks_percpu *rtpcp_next; + + cpu = rtpcp->cpu; + cpunext = cpu * 2 + 1; + if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); + queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpunext++; + if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); + queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + } + } + + if (rcu_segcblist_empty(&rtpcp->cblist)) + return; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + len = rcl.len; + for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + cond_resched(); + } + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_add_len(&rtpcp->cblist, -len); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} + +// Workqueue flood to advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work); + + rtp = rtpcp->rtpp; + rcu_tasks_invoke_cbs(rtp, rtpcp); +} + /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ static int __noreturn rcu_tasks_kthread(void *arg) { - unsigned long flags; - struct rcu_head *list; - struct rcu_head *next; + int needgpcb; struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -199,42 +508,22 @@ static int __noreturn rcu_tasks_kthread(void *arg) for (;;) { set_tasks_gp_state(rtp, RTGS_WAIT_CBS); - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - smp_mb__after_spinlock(); // Order updates vs. GP. - list = rtp->cbs_head; - rtp->cbs_head = NULL; - rtp->cbs_tail = &rtp->cbs_head; - raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rtp->cbs_wq, - READ_ONCE(rtp->cbs_head)); - if (!rtp->cbs_head) { - WARN_ON(signal_pending(current)); - set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_idle(HZ/10); - } - continue; + wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); } - // Wait for one grace period. - set_tasks_gp_state(rtp, RTGS_WAIT_GP); - rtp->gp_start = jiffies; - rtp->gp_func(rtp); - rtp->n_gps++; - - /* Invoke the callbacks. */ + /* Invoke callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); } @@ -279,14 +568,15 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), - data_race(rtp->n_gps), + data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!!data_race(rtp->cbs_head)], + ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -411,10 +701,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU // read-side critical sections waited for by rcu_tasks_postscan(). // -// Pre-grace-period update-side code is ordered before the grace via the -// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side -// code is ordered before the grace period via synchronize_rcu() call -// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt +// Pre-grace-period update-side code is ordered before the grace +// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code +// is ordered before the grace period via synchronize_rcu() call in +// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt // disabling. /* Pre-grace-period preparation. */ @@ -586,13 +876,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); */ void rcu_barrier_tasks(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); + rcu_barrier_tasks_generic(&rcu_tasks); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; @@ -724,13 +1014,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); */ void rcu_barrier_tasks_rude(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks_rude(); + rcu_barrier_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); static int __init rcu_spawn_tasks_rude_kthread(void) { + cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; @@ -1073,25 +1363,50 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) // Any tasks that exit after this point will set ->trc_reader_checked. } +/* Communicate task state back to the RCU tasks trace stall warning request. */ +struct trc_stall_chk_rdr { + int nesting; + int ipi_to_cpu; + u8 needqs; +}; + +static int trc_check_slow_task(struct task_struct *t, void *arg) +{ + struct trc_stall_chk_rdr *trc_rdrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); + trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); + trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs); + return true; +} + /* Show the state of a task stalling the current RCU tasks trace GP. */ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) { int cpu; + struct trc_stall_chk_rdr trc_rdr; + bool is_idle_tsk = is_idle_task(t); if (*firstreport) { pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); *firstreport = false; } - // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). cpu = task_cpu(t); - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", - t->pid, - ".I"[READ_ONCE(t->trc_ipi_to_cpu) >= 0], - ".i"[is_idle_task(t)], - ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], - READ_ONCE(t->trc_reader_nesting), - " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], - cpu); + if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) + pr_alert("P%d: %c\n", + t->pid, + ".i"[is_idle_tsk]); + else + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[trc_rdr.ipi_to_cpu >= 0], + ".i"[is_idle_tsk], + ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + trc_rdr.nesting, + " N"[!!trc_rdr.needqs], + cpu); sched_show_task(t); } @@ -1121,7 +1436,8 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. - if (READ_ONCE(t->trc_reader_checked)) + if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && + READ_ONCE(t->trc_reader_checked)) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); @@ -1156,7 +1472,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) // Yes, this assumes that CPUs process IPIs in order. If that ever // changes, there will need to be a recheck and/or timed wait. for_each_online_cpu(cpu) - if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))) + if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); // Remove the safety count. @@ -1256,13 +1572,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); */ void rcu_barrier_tasks_trace(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks_trace(); + rcu_barrier_tasks_generic(&rcu_tasks_trace); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { + cblist_init_generic(&rcu_tasks_trace); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = HZ / 10; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..a4c25a6283b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,7 +79,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, + .cblist.flags = SEGCBLIST_RCU_CORE, #endif }; static struct rcu_state rcu_state = { @@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); // instrumentation for the noinstr rcu_dynticks_eqs_enter() @@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (!in_nmi()) - rcu_prepare_for_idle(); - // instrumentation for the noinstr rcu_dynticks_eqs_enter() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); @@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user) // instrumentation for the noinstr rcu_dynticks_eqs_exit() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); @@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. - if (!in_nmi()) { - instrumentation_begin(); - rcu_cleanup_after_idle(); - instrumentation_end(); - } - instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); @@ -1087,6 +1076,24 @@ void rcu_irq_enter_irqson(void) } /* + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. This is used by + * the idle-entry code to figure out whether it is safe to disable the + * scheduler-clock interrupt. + * + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. + */ +int rcu_needs_cpu(u64 basemono, u64 *nextevt) +{ + *nextevt = KTIME_MAX; + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); +} + +/* * If any sort of urgency was applied to the current CPU (for example, * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order * to get to a quiescent state, disable it. @@ -1467,7 +1474,7 @@ static void rcu_gp_kthread_wake(void) { struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); - if ((current == t && !in_irq() && !in_serving_softirq()) || + if ((current == t && !in_hardirq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); @@ -1590,10 +1597,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, struct rcu_data *rdp) { rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || - !raw_spin_trylock_rcu_node(rnp)) + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp)) return; - WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); + // The grace period cannot end while we hold the rcu_node lock. + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) + WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); raw_spin_unlock_rcu_node(rnp); } @@ -2277,7 +2285,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2304,15 +2312,30 @@ rcu_report_qs_rdp(struct rcu_data *rdp) /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. + * + * NOCB kthreads have their own way to deal with that... */ - if (!offloaded) + if (!rcu_rdp_is_offloaded(rdp)) { needwake = rcu_accelerate_cbs(rnp, rdp); + } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + /* + * ...but NOCB kthreads may miss or delay callbacks acceleration + * if in the middle of a (de-)offloading process. + */ + needacc = true; + } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(); + + if (needacc) { + rcu_nocb_lock_irqsave(rdp, flags); + rcu_accelerate_cbs_unlocked(rnp, rdp); + rcu_nocb_unlock_irqrestore(rdp, flags); + } } } @@ -2444,7 +2467,6 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -2462,18 +2484,17 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Extract the list of ready callbacks, disabling to prevent + * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. */ - local_irq_save(flags); - rcu_nocb_lock(rdp); + rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (unlikely(bl > 100)) { + if (in_serving_softirq() && unlikely(bl > 100)) { long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; @@ -2482,7 +2503,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); - if (offloaded) + if (rcu_rdp_is_offloaded(rdp)) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); @@ -2510,18 +2531,21 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. */ - if (count >= bl && !offloaded && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) - break; - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ - break; - } - if (!in_serving_softirq()) { + if (in_serving_softirq()) { + if (count >= bl && (need_resched() || !is_idle_task(current))) + break; + /* + * Make sure we don't spend too much time here and deprive other + * softirq vectors of CPU cycles. + */ + if (unlikely(tlimit)) { + /* only call local_clock() every 32 callbacks */ + if (likely((count & 31) || local_clock() < tlimit)) + continue; + /* Exceeded the time limit, so leave. */ + break; + } + } else { local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); @@ -2530,8 +2554,7 @@ static void rcu_do_batch(struct rcu_data *rdp) } } - local_irq_save(flags); - rcu_nocb_lock(rdp); + rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2565,9 +2588,6 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_unlock_irqrestore(rdp, flags); - /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_core(); tick_dep_clear_task(current, TICK_DEP_BIT_RCU); } @@ -2706,6 +2726,23 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + /* + * On RT rcu_core() can be preempted when IRQs aren't disabled. + * Therefore this function can race with concurrent NOCB (de-)offloading + * on this CPU and the below condition must be considered volatile. + * However if we race with: + * + * _ Offloading: In the worst case we accelerate or process callbacks + * concurrently with NOCB kthreads. We are guaranteed to + * call rcu_nocb_lock() if that happens. + * + * _ Deoffloading: In the worst case we miss callbacks acceleration or + * processing. This is fine because the early stage + * of deoffloading invokes rcu_core() after setting + * SEGCBLIST_RCU_CORE. So we guarantee that we'll process + * what could have been dismissed without the need to wait + * for the next rcu_pending() check in the next jiffy. + */ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) @@ -2714,7 +2751,7 @@ static __latent_entropy void rcu_core(void) WARN_ON_ONCE(!rdp->beenonline); /* Report any deferred quiescent states if preemption enabled. */ - if (!(preempt_count() & PREEMPT_MASK)) { + if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { set_tsk_need_resched(current); @@ -2737,8 +2774,12 @@ static __latent_entropy void rcu_core(void) /* If there are callbacks ready, invoke them. */ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && - likely(READ_ONCE(rcu_scheduler_fully_active))) + likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); + /* Re-invoke RCU core processing if there are callbacks remaining. */ + if (rcu_segcblist_ready_cbs(&rdp->cblist)) + invoke_rcu_core(); + } /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); @@ -2982,7 +3023,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); - kasan_record_aux_stack(head); + kasan_record_aux_stack_noalloc(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3547,7 +3588,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) return; } - kasan_record_aux_stack(ptr); + kasan_record_aux_stack_noalloc(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..486fc901bd08 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -157,7 +157,6 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -189,11 +188,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; @@ -227,8 +221,11 @@ struct rcu_data { struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; - struct rcu_data *nocb_next_cb_rdp; - /* Next rcu_data in wakeup chain. */ + struct list_head nocb_head_rdp; /* + * Head of rcu_data list in wakeup chain, + * if rdp_gp. + */ + struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; @@ -419,8 +416,6 @@ static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static void __init rcu_spawn_boost_kthreads(void); -static void rcu_cleanup_after_idle(void); -static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -447,12 +442,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); -#define rcu_nocb_lock_irqsave(rdp, flags) \ -do { \ - if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ - local_irq_save(flags); \ - else \ - raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ + +/* + * Disable IRQs before checking offloaded state so that local + * locking is safe against concurrent de-offloading. + */ +#define rcu_nocb_lock_irqsave(rdp, flags) \ +do { \ + local_irq_save(flags); \ + if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ + raw_spin_lock(&(rdp)->nocb_lock); \ } while (0) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f3947c49eee7..237a79989aba 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -255,7 +255,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->exp_deferred_qs, false); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -387,6 +387,7 @@ retry_ipi: continue; } if (get_cpu() == cpu) { + mask_ofl_test |= mask; put_cpu(); continue; } @@ -506,7 +507,10 @@ static void synchronize_rcu_expedited_wait(void) if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_disable(); + if (cpu_online(cpu)) + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_enable(); } } j = READ_ONCE(jiffies_till_first_fqs); @@ -655,7 +659,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -677,7 +681,7 @@ static void rcu_exp_handler(void *unused) if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -759,7 +763,7 @@ static void sync_sched_exp_online_cleanup(int cpu) my_cpu = get_cpu(); /* Quiescent state either not needed or already requested, leave. */ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - rdp->cpu_no_qs.b.exp) { + READ_ONCE(rdp->cpu_no_qs.b.exp)) { put_cpu(); return; } diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 368ef7b9af4f..eeafb546a7a0 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,16 +60,22 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ + +static bool rcu_nocb_is_setup; + static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); + if (*str == '=') { + if (cpulist_parse(++str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } } + rcu_nocb_is_setup = true; return 1; } -__setup("rcu_nocbs=", rcu_nocb_setup); +__setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { @@ -625,7 +631,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * and the global grace-period kthread are awakened if needed. */ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + /* + * An rcu_data structure is removed from the list after its + * CPU is de-offloaded and added to the list before that CPU is + * (re-)offloaded. If the following loop happens to be referencing + * that rcu_data structure during the time that the corresponding + * CPU is de-offloaded and then immediately re-offloaded, this + * loop's rdp pointer will be carried to the end of the list by + * the resulting pair of list operations. This can cause the loop + * to skip over some of the rcu_data structures that were supposed + * to have been scanned. Fortunately a new iteration through the + * entire loop is forced after a given CPU's rcu_data structure + * is added to the list, so the skipped-over rcu_data structures + * won't be ignored for long. + */ + list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) { bool needwake_state = false; if (!nocb_gp_enabled_cb(rdp)) @@ -789,6 +809,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); + + local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); @@ -841,17 +873,6 @@ static void nocb_cb_wait(struct rcu_data *rdp) if (needwake_state) swake_up_one(&rdp->nocb_state_wq); - - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); } /* @@ -990,22 +1011,33 @@ static long rcu_nocb_rdp_deoffload(void *arg) * will refuse to put anything into the bypass. */ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + /* + * Start with invoking rcu_core() early. This way if the current thread + * happens to preempt an ongoing call to rcu_core() in the middle, + * leaving some work dismissed because rcu_core() still thinks the rdp is + * completely offloaded, we are guaranteed a nearby future instance of + * rcu_core() to catch up. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); + invoke_rcu_core(); ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + /* Stop nocb_gp_wait() from iterating over this structure. */ + list_del_rcu(&rdp->nocb_entry_rdp); /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. */ rcu_nocb_lock_irqsave(rdp, flags); /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb + * Theoretically we could clear SEGCBLIST_LOCKING after the nocb * lock is released but how about being paranoid for once? */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); /* - * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * Without SEGCBLIST_LOCKING, we can't use * rcu_nocb_unlock_irqrestore() anymore. */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); @@ -1057,15 +1089,26 @@ static long rcu_nocb_rdp_offload(void *arg) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); + + /* + * Cause future nocb_gp_wait() invocations to iterate over + * structure, resetting ->nocb_gp_sleep and waking up the related + * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock + * before setting ->nocb_gp_sleep again, we are guaranteed to + * iterate this newly added structure before "rcuog" goes to + * sleep again. + */ + list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); + /* - * Can't use rcu_nocb_lock_irqsave() while we are in - * SEGCBLIST_SOFTIRQ_ONLY mode. + * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING + * is set. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); /* * We didn't take the nocb lock while working on the - * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). * Every modifications that have been done previously on * rdp->cblist must be visible remotely by the nocb kthreads * upon wake up after reading the cblist flags. @@ -1084,6 +1127,14 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + /* + * All kthreads are ready to work, we can finally relieve rcu_core() and + * enable nocb bypass. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; } @@ -1122,13 +1173,17 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; + if (need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask)) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } } + rcu_nocb_is_setup = true; } - if (!cpumask_available(rcu_nocb_mask)) + + if (!rcu_nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1154,8 +1209,8 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); } @@ -1178,17 +1233,17 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread * for this CPU's group has not yet been created, spawn it as well. */ -static void rcu_spawn_one_nocb_kthread(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_data *rdp_gp; struct task_struct *t; - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) + if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + return; + + /* If there already is an rcuo kthread, then nothing to do. */ + if (rdp->nocb_cb_kthread) return; /* If we didn't spawn the GP kthread first, reorganize! */ @@ -1211,16 +1266,6 @@ static void rcu_spawn_one_nocb_kthread(int cpu) } /* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. - */ -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ - if (rcu_scheduler_fully_active) - rcu_spawn_one_nocb_kthread(cpu); -} - -/* * Once the scheduler is running, spawn rcuo kthreads for all online * no-CBs CPUs. This assumes that the early_initcall()s happen before * non-boot CPUs come online -- if this changes, we will need to add @@ -1230,8 +1275,10 @@ static void __init rcu_spawn_nocb_kthreads(void) { int cpu; - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); + if (rcu_nocb_is_setup) { + for_each_online_cpu(cpu) + rcu_spawn_cpu_nocb_kthread(cpu); + } } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ @@ -1251,7 +1298,6 @@ static void __init rcu_organize_nocb_kthreads(void) int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; if (!cpumask_available(rcu_nocb_mask)) return; @@ -1265,14 +1311,14 @@ static void __init rcu_organize_nocb_kthreads(void) * Should the corresponding CPU come online in the future, then * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ - for_each_cpu(cpu, rcu_nocb_mask) { + for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; + INIT_LIST_HEAD(&rdp->nocb_head_rdp); if (dump_tree) { if (!firsttime) pr_cont("%s\n", gotnocbscbs @@ -1285,12 +1331,12 @@ static void __init rcu_organize_nocb_kthreads(void) } else { /* Another CB kthread, link to previous GP kthread. */ gotnocbscbs = true; - rdp->nocb_gp_rdp = rdp_gp; - rdp_prev->nocb_next_cb_rdp = rdp; if (dump_tree) pr_cont(" %d", cpu); } - rdp_prev = rdp; + rdp->nocb_gp_rdp = rdp_gp; + if (cpumask_test_cpu(cpu, rcu_nocb_mask)) + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); } if (gotnocbs && dump_tree) pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); @@ -1352,6 +1398,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) { char bufw[20]; char bufr[20]; + struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wassleep; @@ -1359,11 +1406,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, + &rdp->nocb_entry_rdp, + typeof(*rdp), + nocb_entry_rdp); + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, + nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..c5b45c2f68a1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,7 +16,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* - * In order to read the offloaded state of an rdp is a safe + * In order to read the offloaded state of an rdp in a safe * and stable way and prevent from its value to be changed * under us, we must either hold the barrier mutex, the cpu * hotplug lock (read or write) or the nocb lock. Local @@ -51,12 +51,10 @@ static void __init rcu_bootup_announce_oddness(void) RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); - if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) - pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); + pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -88,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) - pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + pr_info("\tRCU callback double-/use-after-free debug is enabled.\n"); if (gp_preinit_delay) pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); if (gp_init_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) - pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) @@ -260,10 +258,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->exp_deferred_qs); + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); } /* @@ -277,12 +275,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * current task, there might be any number of other tasks blocked while * in an RCU read-side critical section. * + * Unlike non-preemptible-RCU, quiescent state reports for expedited + * grace periods are handled separately via deferred quiescent states + * and context switch events. + * * Callers to this function must disable preemption. */ static void rcu_qs(void) { RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { + if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); @@ -350,7 +352,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); @@ -477,7 +479,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->exp_deferred_qs) { + if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; } @@ -497,7 +499,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); /* Clean up if blocked during RCU read-side critical section. */ @@ -580,7 +582,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.exp_deferred_qs) || + return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && rcu_preempt_depth() == 0; } @@ -642,7 +644,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. @@ -845,10 +847,8 @@ static void rcu_qs(void) trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* @@ -925,7 +925,18 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } -static void rcu_preempt_deferred_qs(struct task_struct *t) { } + +// Except that we do need to respond to a request by an expedited grace +// period for a quiescent state from this CPU. Note that requests from +// tasks are handled when removing the task from the blocked-tasks list +// below. +static void rcu_preempt_deferred_qs(struct task_struct *t) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (rdp->cpu_no_qs.b.exp) + rcu_report_exp_rdp(rdp); +} /* * Because there is no preemptible RCU, there can be no readers blocked, @@ -1153,7 +1164,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) /* * Create an RCU-boost kthread for the specified node if one does not * already exist. We only create this kthread for preemptible RCU. - * Returns zero if all is well, a negated errno otherwise. */ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { @@ -1204,8 +1214,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); if (cpumask_weight(cm) == 0) - cpumask_setall(cm); + cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } @@ -1253,201 +1264,6 @@ static void __init rcu_spawn_boost_kthreads(void) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) - -/* - * Check to see if any future non-offloaded RCU-related work will need - * to be done by the current CPU, even if none need be done immediately, - * returning 1 if so. This function is part of the RCU implementation; - * it is -not- an exported member of the RCU API. - * - * Because we not have RCU_FAST_NO_HZ, just check whether or not this - * CPU has RCU callbacks queued. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - *nextevt = KTIME_MAX; - return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(void) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, - * is nothing. - */ -static void rcu_prepare_for_idle(void) -{ -} - -#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. - * - * The following preprocessor symbol controls this: - * - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * - * The value below works well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ - -static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; -module_param(rcu_idle_gp_delay, int, 0644); - -/* - * Try to advance callbacks on the current CPU, but only if it has been - * awhile since the last time we did so. Afterwards, if there are any - * callbacks ready for immediate invocation, return true. - */ -static bool __maybe_unused rcu_try_advance_all_cbs(void) -{ - bool cbs_ready = false; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - - /* Exit early if we advanced recently. */ - if (jiffies == rdp->last_advance_all) - return false; - rdp->last_advance_all = jiffies; - - rnp = rdp->mynode; - - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - return cbs_ready; -} - -/* - * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready - * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller about what to set the timeout. - * - * The caller must have disabled interrupts. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - unsigned long dj; - - lockdep_assert_irqs_disabled(); - - /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ - if (rcu_segcblist_empty(&rdp->cblist) || - rcu_rdp_is_offloaded(rdp)) { - *nextevt = KTIME_MAX; - return 0; - } - - /* Attempt to advance callbacks. */ - if (rcu_try_advance_all_cbs()) { - /* Some ready to invoke, so initiate later invocation. */ - invoke_rcu_core(); - return 1; - } - rdp->last_accelerate = jiffies; - - /* Request timer and round. */ - dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - - *nextevt = basemono + dj * TICK_NSEC; - return 0; -} - -/* - * Prepare a CPU for idle from an RCU perspective. The first major task is to - * sense whether nohz mode has been enabled or disabled via sysfs. The second - * major task is to accelerate (that is, assign grace-period numbers to) any - * recently arrived callbacks. - * - * The caller must have disabled interrupts. - */ -static void rcu_prepare_for_idle(void) -{ - bool needwake; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - int tne; - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - - /* Handle nohz enablement switches conservatively. */ - tne = READ_ONCE(tick_nohz_active); - if (tne != rdp->tick_nohz_enabled_snap) { - if (!rcu_segcblist_empty(&rdp->cblist)) - invoke_rcu_core(); /* force nohz to see update. */ - rdp->tick_nohz_enabled_snap = tne; - return; - } - if (!tne) - return; - - /* - * If we have not yet accelerated this jiffy, accelerate all - * callbacks on this CPU. - */ - if (rdp->last_accelerate == jiffies) - return; - rdp->last_accelerate = jiffies; - if (rcu_segcblist_pend_cbs(&rdp->cblist)) { - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rnp, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - if (needwake) - rcu_gp_kthread_wake(); - } -} - -/* - * Clean up for exit from idle. Attempt to advance callbacks based on - * any grace periods that elapsed while the CPU was idle, and if any - * callbacks are now ready to invoke, initiate invocation. - */ -static void rcu_cleanup_after_idle(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); -} - -#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -1455,7 +1271,7 @@ static void rcu_cleanup_after_idle(void) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPU CPUs. + * RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(void) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5e2fa6fd97f1..21bebf7c9030 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void) } } -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - - sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - !!rdp->tick_nohz_enabled_snap); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - static const char * const gp_state_names[] = { [RCU_GP_IDLE] = "RCU_GP_IDLE", [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", @@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) * of RCU grace periods that this CPU is ignorant of, for example, "1" * if the CPU was aware of the previous grace period. * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + * Also print out idle info. */ static void print_cpu_stall_info(int cpu) { unsigned long delta; bool falsepositive; - char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; @@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu) ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu) rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz, falsepositive ? " (false positive?)" : ""); } diff --git a/kernel/resource.c b/kernel/resource.c index 5ad3eba619ba..9c08d6e9eef2 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -99,7 +99,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = PDE_DATA(file_inode(m->file)); + struct resource *p = pde_data(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -115,7 +115,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = PDE_DATA(file_inode(m->file)); + struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5d42f44e3e1a..dcb0410950e4 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -38,14 +38,10 @@ #define SCFTORT_STRING "scftorture" #define SCFTORT_FLAG SCFTORT_STRING ": " -#define SCFTORTOUT(s, x...) \ - pr_alert(SCFTORT_FLAG s, ## x) - #define VERBOSE_SCFTORTOUT(s, x...) \ - do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0) -#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) +#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); @@ -587,14 +583,14 @@ static int __init scf_torture_init(void) if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 && weight_all1 == 0 && weight_all_wait1 == 0) { - VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); + SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); firsterr = -EINVAL; goto unwind; } if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false); else if (weight_resched1) - VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); + SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); @@ -625,12 +621,12 @@ static int __init scf_torture_init(void) nthreads = num_online_cpus(); scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); if (!scf_stats_p) { - VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + SCFTORTOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } - VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads); atomic_set(&n_started, nthreads); for (i = 0; i < nthreads; i++) { diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c7421f2d05e1..c83b37af155b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,11 +11,10 @@ ccflags-y += $(call cc-disable-warning, unused-but-set-variable) # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -# There are numerous data races here, however, most of them are due to plain accesses. -# This would make it even harder for syzbot to find reproducers, because these -# bugs trigger without specific input. Disable by default, but should re-enable -# eventually. +# Disable KCSAN to avoid excessive noise and performance degradation. To avoid +# false positives ensure barriers implied by sched functions are instrumented. KCSAN_SANITIZE := n +KCSAN_INSTRUMENT_BARRIERS := y ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..848eaa0efe0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct * return false; /* flip prio, so high prio is leftmost */ - if (prio_less(b, a, task_rq(a)->core->core_forceidle)) + if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count)) return true; return false; @@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); } -void sched_core_dequeue(struct rq *rq, struct task_struct *p) +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { rq->core->core_task_seq++; - if (!sched_core_enqueued(p)) - return; + if (sched_core_enqueued(p)) { + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); + } - rb_erase(&p->core_node, &rq->core_tree); - RB_CLEAR_NODE(&p->core_node); + /* + * Migrating the last task off the cpu, with the cpu in forced idle + * state. Reschedule to create an accounting edge for forced idle, + * and re-examine whether the core is still in forced idle state. + */ + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && + rq->core->core_forceidle_count && rq->curr == rq->idle) + resched_curr(rq); } /* @@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled) for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; + cpu_rq(cpu)->core->core_forceidle_start = 0; + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); @@ -364,7 +374,8 @@ void sched_core_put(void) #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } -static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ @@ -1918,7 +1929,7 @@ static void __init init_uclamp_rq(struct rq *rq) }; } - rq->uclamp_flags = 0; + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) @@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) - sched_core_dequeue(rq, p); + sched_core_dequeue(rq, p, flags); if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); @@ -2173,6 +2184,9 @@ void migrate_enable(void) return; } + if (WARN_ON_ONCE(!p->migration_disabled)) + return; + /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). @@ -5244,6 +5258,7 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); + sched_core_tick(rq); rq_unlock(rq, &rf); @@ -5656,6 +5671,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; bool fi_before = false; + bool core_clock_updated = (rq == rq->core); unsigned long cookie; int i, cpu, occ = 0; struct rq *rq_i; @@ -5708,10 +5724,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* reset state */ rq->core->core_cookie = 0UL; - if (rq->core->core_forceidle) { + if (rq->core->core_forceidle_count) { + if (!core_clock_updated) { + update_rq_clock(rq->core); + core_clock_updated = true; + } + sched_core_account_forceidle(rq); + /* reset after accounting force idle */ + rq->core->core_forceidle_start = 0; + rq->core->core_forceidle_count = 0; + rq->core->core_forceidle_occupation = 0; need_sync = true; fi_before = true; - rq->core->core_forceidle = false; } /* @@ -5753,7 +5777,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); - if (i != cpu) + /* + * Current cpu always has its clock updated on entrance to + * pick_next_task(). If the current cpu is not the core, + * the core may also have been updated above. + */ + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); p = rq_i->core_pick = pick_task(rq_i); @@ -5783,7 +5812,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (p == rq_i->idle) { if (rq_i->nr_running) { - rq->core->core_forceidle = true; + rq->core->core_forceidle_count++; if (!fi_before) rq->core->core_forceidle_seq++; } @@ -5792,6 +5821,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } } + if (schedstat_enabled() && rq->core->core_forceidle_count) { + rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_occupation = occ; + } + rq->core->core_pick_seq = rq->core->core_task_seq; next = rq->core_pick; rq->core_sched_seq = rq->core->core_pick_seq; @@ -5828,8 +5862,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * 1 0 1 * 1 1 0 */ - if (!(fi_before && rq->core->core_forceidle)) - task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + if (!(fi_before && rq->core->core_forceidle_count)) + task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); rq_i->core_pick->core_occupation = occ; @@ -6033,11 +6067,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu) goto unlock; /* copy the shared state to the new leader */ - core_rq->core_task_seq = rq->core_task_seq; - core_rq->core_pick_seq = rq->core_pick_seq; - core_rq->core_cookie = rq->core_cookie; - core_rq->core_forceidle = rq->core_forceidle; - core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle_count = rq->core_forceidle_count; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + + /* + * Accounting edge for forced idle is handled in pick_next_task(). + * Don't need another one here, since the hotplug thread shouldn't + * have a cookie. + */ + core_rq->core_forceidle_start = 0; /* install new leader */ for_each_cpu(t, smt_mask) { @@ -6617,11 +6659,11 @@ static int __init setup_preempt_mode(char *str) int mode = sched_dynamic_mode(str); if (mode < 0) { pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); - return 1; + return 0; } sched_dynamic_update(mode); - return 0; + return 1; } __setup("preempt=", setup_preempt_mode); @@ -7126,7 +7168,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long sched_cpu_util(int cpu, unsigned long max) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, + return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, ENERGY_UTIL, NULL); } #endif /* CONFIG_SMP */ @@ -8176,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8196,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8216,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); @@ -8520,7 +8556,7 @@ void sched_show_task(struct task_struct *p) rcu_read_unlock(); pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", free, task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); + read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -8599,14 +8635,6 @@ void __init init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); - /* - * The idle task doesn't need the kthread struct to function, but it - * is dressed up as a per-CPU kthread and thus needs to play the part - * if we want to avoid special-casing it in code that deals with per-CPU - * kthreads. - */ - set_kthread_struct(idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -8619,9 +8647,6 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - #ifdef CONFIG_SMP /* * It's possible that init_idle() gets called multiple times on a task, @@ -8777,7 +8802,6 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } - scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } @@ -9413,7 +9437,9 @@ void __init sched_init(void) rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; - rq->core_forceidle = false; + rq->core_forceidle_count = 0; + rq->core_forceidle_occupation = 0; + rq->core_forceidle_start = 0; rq->core_cookie = 0UL; #endif @@ -9428,6 +9454,14 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + WARN_ON(!set_kthread_struct(current)); + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 517f72b008f5..c8746a9a7ada 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, enqueued = sched_core_enqueued(p); if (enqueued) - sched_core_dequeue(rq, p); + sched_core_dequeue(rq, p, DEQUEUE_SAVE); old_cookie = p->core_cookie; p->core_cookie = cookie; @@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * If task is currently running, it may not be compatible anymore after * the cookie change, so enter the scheduler on its CPU to schedule it * away. + * + * Note that it is possible that as a result of this cookie change, the + * core has now entered/left forced idle state. Defer accounting to the + * next scheduling edge, rather than always forcing a reschedule here. */ if (task_running(rq, p)) resched_curr(rq); @@ -232,3 +236,63 @@ out: return err; } +#ifdef CONFIG_SCHEDSTATS + +/* REQUIRES: rq->core's clock recently updated. */ +void __sched_core_account_forceidle(struct rq *rq) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + u64 delta, now = rq_clock(rq->core); + struct rq *rq_i; + struct task_struct *p; + int i; + + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(!rq->core->core_forceidle_count); + + if (rq->core->core_forceidle_start == 0) + return; + + delta = now - rq->core->core_forceidle_start; + if (unlikely((s64)delta <= 0)) + return; + + rq->core->core_forceidle_start = now; + + if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { + /* can't be forced idle without a running task */ + } else if (rq->core->core_forceidle_count > 1 || + rq->core->core_forceidle_occupation > 1) { + /* + * For larger SMT configurations, we need to scale the charged + * forced idle amount since there can be more than one forced + * idle sibling and more than one running cookied task. + */ + delta *= rq->core->core_forceidle_count; + delta = div_u64(delta, rq->core->core_forceidle_occupation); + } + + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick ?: rq_i->curr; + + if (p == rq_i->idle) + continue; + + __schedstat_add(p->stats.core_forceidle_sum, delta); + } +} + +void __sched_core_tick(struct rq *rq) +{ + if (!rq->core->core_forceidle_count) + return; + + if (rq != rq->core) + update_rq_clock(rq->core); + + __sched_core_account_forceidle(rq); +} + +#endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 893eece65bfd..3d06c5e4220d 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -struct cpuacct_usage { - u64 usages[CPUACCT_STAT_NSTATS]; -}; - /* track CPU usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every CPU */ - struct cpuacct_usage __percpu *cpuusage; + u64 __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(struct cpuacct_usage); + ca->cpuusage = alloc_percpu(u64); if (!ca->cpuusage) goto out_free_ca; @@ -99,14 +95,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, enum cpuacct_stat_index index) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; u64 data; /* * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of usages. */ - BUG_ON(index > CPUACCT_STAT_NSTATS); + if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS)) + return 0; #ifndef CONFIG_64BIT /* @@ -115,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - if (index == CPUACCT_STAT_NSTATS) { - int i = 0; - - data = 0; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - data += cpuusage->usages[i]; - } else { - data = cpuusage->usages[index]; + switch (index) { + case CPUACCT_STAT_USER: + data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE]; + break; + case CPUACCT_STAT_SYSTEM: + data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] + + cpustat[CPUTIME_SOFTIRQ]; + break; + case CPUACCT_STAT_NSTATS: + data = *cpuusage; + break; } #ifndef CONFIG_64BIT @@ -132,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, return data; } -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - int i; + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + + /* Don't allow to reset global kernel_cpustat */ + if (ca == &root_cpuacct) + return; #ifndef CONFIG_64BIT /* @@ -143,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) */ raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - cpuusage->usages[i] = val; + *cpuusage = 0; + cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0; + cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0; + cpustat[CPUTIME_SOFTIRQ] = 0; #ifndef CONFIG_64BIT raw_spin_rq_unlock_irq(cpu_rq(cpu)); @@ -196,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; for_each_possible_cpu(cpu) - cpuacct_cpuusage_write(ca, cpu, 0); + cpuacct_cpuusage_write(ca, cpu); return 0; } @@ -243,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) seq_puts(m, "\n"); for_each_possible_cpu(cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - seq_printf(m, "%d", cpu); - - for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit - * platforms. - */ - raw_spin_rq_lock_irq(cpu_rq(cpu)); -#endif - - seq_printf(m, " %llu", cpuusage->usages[index]); - -#ifndef CONFIG_64BIT - raw_spin_rq_unlock_irq(cpu_rq(cpu)); -#endif - } + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %llu", + cpuacct_cpuusage_read(ca, cpu, index)); seq_puts(m, "\n"); } return 0; @@ -270,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); - s64 val[CPUACCT_STAT_NSTATS]; + struct task_cputime cputime; + u64 val[CPUACCT_STAT_NSTATS]; int cpu; int stat; - memset(val, 0, sizeof(val)); + memset(&cputime, 0, sizeof(cputime)); for_each_possible_cpu(cpu) { u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + cputime.utime += cpustat[CPUTIME_USER]; + cputime.utime += cpustat[CPUTIME_NICE]; + cputime.stime += cpustat[CPUTIME_SYSTEM]; + cputime.stime += cpustat[CPUTIME_IRQ]; + cputime.stime += cpustat[CPUTIME_SOFTIRQ]; + + cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu); } + cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime, + &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { - seq_printf(sf, "%s %lld\n", - cpuacct_stat_desc[stat], - (long long)nsec_to_clock_t(val[stat])); + seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat], + nsec_to_clock_t(val[stat])); } return 0; @@ -339,16 +335,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); - - if (regs && user_mode(regs)) - index = CPUACCT_STAT_USER; rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(ca->cpuusage->usages[index], cputime); + __this_cpu_add(*ca->cpuusage, cputime); rcu_read_unlock(); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e7af18857371..26778884d9ab 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 872e481d5098..b7ec42732b28 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } @@ -615,7 +615,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, }; - task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -828,19 +829,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret; if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; } do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; @@ -850,6 +853,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue; + ret = true; delta = vtime_delta(vtime); /* @@ -861,6 +865,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; } static int vtime_state_fetch(struct vtime *vtime, int cpu) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7dcbaa31c5d9..aa29211de1bf 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PN(avg_atom); __PN(avg_per_cpu); + +#ifdef CONFIG_SCHED_CORE + PN_SCHEDSTAT(core_forceidle_sum); +#endif } __P(nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e476f6d9435..5146163bfabb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1502,7 +1502,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int dst_running, int dst_weight); @@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); - ns->util += cpu_util(cpu); + ns->util += cpu_util_cfs(cpu); ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); @@ -3029,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = get_pelt_divider(&se->avg); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3240,7 +3241,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * As is, the util number is not freq-invariant (we'd have to * implement arch_scale_freq_capacity() for that). * - * See cpu_util(). + * See cpu_util_cfs(). */ cpufreq_update_util(rq, flags); } @@ -3382,7 +3383,6 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3450,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3467,23 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3494,19 +3500,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3533,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3550,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - se->avg.load_sum = runnable_sum; - - delta = load_avg - se->avg.load_avg; - if (!delta) + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) return; - se->avg.load_avg = load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3653,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -3678,15 +3693,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); - sa->util_sum = sa->util_avg * divider; + sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3773,17 +3805,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -4070,7 +4103,8 @@ done: trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, long capacity) +static inline int task_fits_capacity(struct task_struct *p, + unsigned long capacity) { return fits_capacity(uclamp_task_util(p), capacity); } @@ -5509,11 +5543,9 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); + return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -6345,7 +6377,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } -static inline bool asym_fits_capacity(int task_util, int cpu) +static inline bool asym_fits_capacity(unsigned long task_util, int cpu) { if (static_branch_unlikely(&sched_asym_cpucapacity)) return fits_capacity(task_util, capacity_of(cpu)); @@ -6398,8 +6430,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * pattern is IO completions. */ if (is_per_cpu_kthread(current) && + in_task() && prev == smp_processor_id() && - this_rq()->nr_running <= 1) { + this_rq()->nr_running <= 1 && + asym_fits_capacity(task_util, prev)) { return prev; } @@ -6456,58 +6490,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/** - * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. - * @cpu: the CPU to get the utilization of - * - * The unit of the return value must be the one of capacity so we can compare - * the utilization with the capacity of the CPU that is available for CFS task - * (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * The estimated utilization of a CPU is defined to be the maximum between its - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks - * currently RUNNABLE on that CPU. - * This allows to properly represent the expected utilization of a CPU which - * has just got a big task running since a long sleep period. At the same time - * however it preserves the benefits of the "blocked utilization" in - * describing the potential for other tasks waking up on the same CPU. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - * - * Return: the (estimated) utilization for the specified CPU - */ -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested @@ -6528,7 +6510,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util(cpu); + return cpu_util_cfs(cpu); cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6592,7 +6574,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) /* * Utilization (estimated) can exceed the CPU capacity, thus let's * clamp to the maximum CPU capacity to ensure consistency with - * the cpu_util call. + * cpu_util. */ return min_t(unsigned long, util, capacity_orig_of(cpu)); } @@ -6624,7 +6606,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) * During wake-up, the task isn't enqueued yet and doesn't * appear in the cfs_rq->avg.util_est.enqueued of any rq, * so just add it (if needed) to "simulate" what will be - * cpu_util() after the task has been enqueued. + * cpu_util after the task has been enqueued. */ if (dst_cpu == cpu) util_est += _task_util_est(p); @@ -6915,6 +6897,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) break; } + /* + * Usually only true for WF_EXEC and WF_FORK, as sched_domains + * usually do not have SD_BALANCE_WAKE set. That means wakeup + * will usually go to the fast path. + */ if (tmp->flags & sd_flag) sd = tmp; else if (!want_affine) @@ -8586,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct, * * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. */ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, @@ -8661,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group @@ -8681,7 +8671,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct rq *rq = cpu_rq(i); sgs->group_load += cpu_load(rq); - sgs->group_util += cpu_util(i); + sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; @@ -9468,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ static struct sched_group *find_busiest_group(struct lb_env *env) @@ -9699,7 +9688,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util(cpu_of(rq)); + util = cpu_util_cfs(i); /* * Don't try to pull utilization from a CPU with one @@ -11068,7 +11057,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. */ - if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b5add64d9698..3d2825408e3a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -147,11 +147,11 @@ #endif #ifdef CONFIG_RSEQ -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) #else -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 #endif #define MEMBARRIER_CMD_BITMASK \ @@ -159,7 +159,8 @@ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) static void ipi_mb(void *info) { diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index e06071bf3472..c336f5f481bc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + static inline u32 get_pelt_divider(struct sched_avg *avg) { - return LOAD_AVG_MAX - 1024 + avg->period_contrib; + return PELT_MIN_DIVIDER + avg->period_contrib; } static inline void cfs_se_util_change(struct sched_avg *avg) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1652f2bb54b7..e14358178849 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pressure stall information for CPU, memory and IO * @@ -34,13 +35,19 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * Naturally, the FULL state doesn't exist for the CPU resource at the - * system level, but exist at the cgroup level, means all non-idle tasks - * in a cgroup are delayed on the CPU resource which used by others outside - * of the cgroup or throttled by the cgroup cpu.max configuration. - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an oncpu task. + * + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. * * The percentage of wallclock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, @@ -81,13 +88,13 @@ * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -112,7 +119,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_SOME: return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); case PSI_CPU_FULL: @@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu, if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], clear, set); + groupc->tasks[3], groupc->tasks[4], + clear, set); psi_bug = 1; } } @@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, /* * When switching between tasks that have an identical * runtime state, the cgroup that contains both tasks - * runtime state, the cgroup that contains both tasks * we reach the first common ancestor. Iterate @next's * ancestors only until we encounter @prev's ONCPU. */ @@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int clear = TSK_ONCPU, set = 0; /* - * When we're going to sleep, psi_dequeue() lets us handle - * TSK_RUNNING and TSK_IOWAIT here, where we can combine it - * with TSK_ONCPU and save walking common ancestors twice. + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. */ if (sleep) { clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; } @@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } @@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 0; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } @@ -1071,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1151,7 +1124,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1180,15 +1152,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1224,9 +1200,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1243,18 +1219,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1264,27 +1228,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1305,14 +1299,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1347,7 +1351,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } @@ -1389,3 +1393,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b48baaba2fc2..7b4f4fbbb404 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + do_start_rt_bandwidth(rt_b); +} + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -1031,13 +1036,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2911,8 +2920,12 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + unsigned long flags; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } int sched_rt_handler(struct ctl_table *table, int write, void *buffer, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0e66749486e7..de53be905739 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1111,8 +1111,10 @@ struct rq { unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie; - unsigned char core_forceidle; + unsigned int core_forceidle_count; unsigned int core_forceidle_seq; + unsigned int core_forceidle_occupation; + u64 core_forceidle_start; #endif }; @@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p) } extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); -extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); @@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { } #include "stats.h" #include "autogroup.h" +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else + +static inline void sched_core_account_forceidle(struct rq *rq) {} + +static inline void sched_core_tick(struct rq *rq) {} + +#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ + #ifdef CONFIG_CGROUP_SCHED /* @@ -2938,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } -static inline unsigned long cpu_util_cfs(struct rq *rq) +/** + * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for. + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Estimated) utilization for the specified CPU. + */ +static inline unsigned long cpu_util_cfs(int cpu) { - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + struct cfs_rq *cfs_rq; + unsigned long util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); if (sched_feat(UTIL_EST)) { util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); + READ_ONCE(cfs_rq->avg.util_est.enqueued)); } - return util; + return min(util, capacity_orig_of(cpu)); } static inline unsigned long cpu_util_rt(struct rq *rq) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index cfb0893a83d4..3a3c826dd83a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (static_branch_likely(&psi_disabled)) return; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + if (!wakeup || p->sched_psi_wake_requeue) { if (p->in_memstall) set |= TSK_MEMSTALL; @@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (p->in_memstall) - clear |= TSK_MEMSTALL; + clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); psi_task_change(p, clear, 0); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 76577d1642a5..eca38107b32f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any diff --git a/kernel/signal.c b/kernel/signal.c index a629b11bf3e0..38602738866e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -626,7 +626,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, * * All callers have to hold the siglock. */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info) +int dequeue_signal(struct task_struct *tsk, sigset_t *mask, + kernel_siginfo_t *info, enum pid_type *type) { bool resched_timer = false; int signr; @@ -634,8 +635,10 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ + *type = PIDTYPE_PID; signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { + *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &resched_timer); #ifdef CONFIG_POSIX_TIMERS @@ -903,8 +906,8 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) struct task_struct *t; sigset_t flush; - if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (!(signal->flags & SIGNAL_GROUP_EXIT)) + if (signal->flags & SIGNAL_GROUP_EXIT) { + if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. @@ -1029,7 +1032,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & SIGNAL_GROUP_EXIT) && + (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* @@ -1820,6 +1823,7 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) + * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ @@ -2383,7 +2387,8 @@ static bool do_signal_stop(int signr) WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || - unlikely(signal_group_exit(sig))) + unlikely(sig->flags & SIGNAL_GROUP_EXIT) || + unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must @@ -2544,7 +2549,7 @@ static void do_freezer_trap(void) freezable_schedule(); } -static int ptrace_signal(int signr, kernel_siginfo_t *info) +static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker @@ -2584,8 +2589,9 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) } /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { - send_signal(signr, info, current, PIDTYPE_PID); + if (sigismember(¤t->blocked, signr) || + fatal_signal_pending(current)) { + send_signal(signr, info, current, type); signr = 0; } @@ -2684,18 +2690,20 @@ relock: goto relock; } - /* Has this task already been marked for death? */ - if (signal_group_exit(signal)) { - ksig->info.si_signo = signr = SIGKILL; - sigdelset(¤t->pending.signal, SIGKILL); - trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, - &sighand->action[SIGKILL - 1]); - recalc_sigpending(); - goto fatal; - } - for (;;) { struct k_sigaction *ka; + enum pid_type type; + + /* Has this task already been marked for death? */ + if ((signal->flags & SIGNAL_GROUP_EXIT) || + signal->group_exec_task) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); + recalc_sigpending(); + goto fatal; + } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) @@ -2728,16 +2736,18 @@ relock: * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ + type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) - signr = dequeue_signal(current, ¤t->blocked, &ksig->info); + signr = dequeue_signal(current, ¤t->blocked, + &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { - signr = ptrace_signal(signr, &ksig->info); + signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } @@ -2863,13 +2873,13 @@ out: } /** - * signal_delivered - + * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask - * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) @@ -2942,7 +2952,7 @@ void exit_signals(struct task_struct *tsk) */ cgroup_threadgroup_change_begin(tsk); - if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { + if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3562,6 +3572,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; + enum pid_type type; int sig, ret = 0; if (ts) { @@ -3578,7 +3589,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info); + sig = dequeue_signal(tsk, &mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested @@ -3597,7 +3608,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); - sig = dequeue_signal(tsk, &mask, info); + sig = dequeue_signal(tsk, &mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); @@ -4185,6 +4196,15 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; + /* + * Return before taking any locks if no actual + * sigaltstack changes were requested. + */ + if (t->sas_ss_sp == (unsigned long)ss_sp && + t->sas_ss_size == ss_size && + t->sas_ss_flags == ss_flags) + return 0; + sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; diff --git a/kernel/softirq.c b/kernel/softirq.c index 322b65d45676..41f470929e99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -595,7 +595,8 @@ void irq_enter_rcu(void) { __irq_enter_raw(); - if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) + if (tick_nohz_full_cpu(smp_processor_id()) || + (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))) tick_irq_enter(); account_hardirq_enter(current); diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ce161a8e8d97..66b8af394e58 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -16,11 +16,13 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include <linux/jump_label.h> #include <linux/sysctl.h> +#include <linux/init.h> static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); @@ -42,6 +44,26 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, state ? "enabled" : "disabled"); return ret; } +static struct ctl_table stackleak_sysctls[] = { + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init stackleak_sysctls_init(void) +{ + register_sysctl_init("kernel", stackleak_sysctls); + return 0; +} +late_initcall(stackleak_sysctls_init); +#endif /* CONFIG_SYSCTL */ #define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) #else diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504..ecc4cf019242 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -220,7 +220,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) niceval = MAX_NICE; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -235,9 +234,11 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -249,16 +250,15 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); out: return error; @@ -283,7 +283,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) return -EINVAL; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -301,11 +300,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -317,19 +318,18 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); return retval; @@ -2261,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + char *name, *pch; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + } else { + /* Reset the name */ + name = NULL; + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, name); + mmap_write_unlock(mm); + kfree(name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2530,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d1944258cfc0..a492f159624f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy); COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..5ae443b2882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,7 +20,6 @@ */ #include <linux/module.h> -#include <linux/aio.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> @@ -33,6 +32,7 @@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemleak.h> +#include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -49,7 +49,6 @@ #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> -#include <linux/dnotify.h> #include <linux/syscalls.h> #include <linux/vmstat.h> #include <linux/nfs_fs.h> @@ -57,19 +56,15 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/perf_event.h> -#include <linux/kprobes.h> -#include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> -#include <linux/sched/coredump.h> #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> #include <linux/userfaultfd_k.h> -#include <linux/coredump.h> #include <linux/latencytop.h> #include <linux/pid.h> #include <linux/delayacct.h> @@ -96,64 +91,21 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include <linux/lockdep.h> #endif -#ifdef CONFIG_CHR_DEV_SG -#include <scsi/sg.h> -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -#include <linux/stackleak.h> -#endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include <linux/nmi.h> -#endif #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif - -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; -static unsigned long zero_ul; -static unsigned long one_ul = 1; -static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int two_hundred = 200; -static int one_thousand = 1000; -#ifdef CONFIG_PRINTK -static int ten_thousand = 10000; -#endif + #ifdef CONFIG_PERF_EVENTS -static int six_hundred_forty_kb = 640 * 1024; +static const int six_hundred_forty_kb = 640 * 1024; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; - -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_task_check_interval_secs - */ -#ifdef CONFIG_DETECT_HUNG_TASK -static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -#endif - -#ifdef CONFIG_INOTIFY_USER -#include <linux/inotify.h> -#endif -#ifdef CONFIG_FANOTIFY -#include <linux/fanotify.h> -#endif - #ifdef CONFIG_PROC_SYSCTL /** @@ -190,8 +142,8 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; +/* min_extfrag_threshold is SYSCTL_ZERO */; +static const int max_extfrag_threshold = 1000; #endif #endif /* CONFIG_SYSCTL */ @@ -802,12 +754,12 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } -static int do_proc_douintvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); @@ -936,17 +888,6 @@ static int proc_taint(struct ctl_table *table, int write, return err; } -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -1142,67 +1083,6 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -1265,10 +1145,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); - if (err) + if (err || neg) { + err = -EINVAL; break; - if (neg) - continue; + } + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; @@ -1926,29 +1807,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1962,7 +1820,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -2129,15 +1987,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_BSD_PROCESS_ACCT { .procname = "acct", @@ -2173,30 +2022,18 @@ static struct ctl_table kern_table[] = { .proc_handler = sysctl_max_threads, }, { - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { .procname = "usermodehelper", .mode = 0555, .child = usermodehelper_table, }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif { .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2204,8 +2041,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_S390 { @@ -2250,66 +2087,9 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, @@ -2321,96 +2101,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", @@ -2513,60 +2203,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_HUNG_TASK -#ifdef CONFIG_SMP - { - .procname = "hung_task_all_cpu_backtrace", - .data = &sysctl_hung_task_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", @@ -2626,7 +2262,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -2635,7 +2271,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, + .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", @@ -2644,7 +2280,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -2675,7 +2311,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", @@ -2707,17 +2343,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_INT_MAX, }, #endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { } }; @@ -2729,7 +2354,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -2738,7 +2363,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -2783,7 +2408,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", @@ -2791,7 +2416,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, + .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", @@ -2800,7 +2425,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", @@ -2808,7 +2433,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, + .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -2840,7 +2465,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two_hundred, + .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_HUGETLB_PAGE { @@ -2897,7 +2522,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &four, + .extra2 = SYSCTL_FOUR, }, #ifdef CONFIG_COMPACTION { @@ -2914,7 +2539,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "extfrag_threshold", @@ -2922,8 +2547,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&max_extfrag_threshold, }, { .procname = "compact_unevictable_allowed", @@ -2959,7 +2584,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = SYSCTL_THREE_THOUSAND, }, { .procname = "percpu_pagelist_high_fraction", @@ -3038,7 +2663,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "min_slab_ratio", @@ -3047,7 +2672,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, #endif #ifdef CONFIG_SMP @@ -3181,221 +2806,6 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_FANOTIFY - { - .procname = "fanotify", - .mode = 0555, - .child = fanotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - static struct ctl_table debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { @@ -3406,17 +2816,6 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { } }; @@ -3424,41 +2823,18 @@ static struct ctl_table dev_table[] = { { } }; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; +DECLARE_SYSCTL_BASE(kernel, kern_table); +DECLARE_SYSCTL_BASE(vm, vm_table); +DECLARE_SYSCTL_BASE(debug, debug_table); +DECLARE_SYSCTL_BASE(dev, dev_table); -int __init sysctl_init(void) +int __init sysctl_init_bases(void) { - struct ctl_table_header *hdr; + register_sysctl_base(kernel); + register_sysctl_base(vm); + register_sysctl_base(debug); + register_sysctl_base(dev); - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); return 0; } #endif /* CONFIG_SYSCTL */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b8a14d2fb5ba..1cf73807b450 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -107,7 +107,7 @@ static u64 suspend_start; * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. */ -#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC) +#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); @@ -199,23 +199,30 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -ulong max_cswd_read_retries = 3; +ulong max_cswd_read_retries = 2; module_param(max_cswd_read_retries, ulong, 0644); EXPORT_SYMBOL_GPL(max_cswd_read_retries); static int verify_n_cpus = 8; module_param(verify_n_cpus, int, 0644); -static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +enum wd_read_status { + WD_READ_SUCCESS, + WD_READ_UNSTABLE, + WD_READ_SKIP +}; + +static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { unsigned int nretries; - u64 wd_end, wd_delta; - int64_t wd_delay; + u64 wd_end, wd_end2, wd_delta; + int64_t wd_delay, wd_seq_delay; for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { local_irq_disable(); *wdnow = watchdog->read(watchdog); *csnow = cs->read(cs); wd_end = watchdog->read(watchdog); + wd_end2 = watchdog->read(watchdog); local_irq_enable(); wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); @@ -226,13 +233,34 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } - return true; + return WD_READ_SUCCESS; } + + /* + * Now compute delay in consecutive watchdog read to see if + * there is too much external interferences that cause + * significant delay in reading both clocksource and watchdog. + * + * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, + * report system busy, reinit the watchdog and skip the current + * watchdog test. + */ + wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); + wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); + if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + goto skip_test; } pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", smp_processor_id(), watchdog->name, wd_delay, nretries); - return false; + return WD_READ_UNSTABLE; + +skip_test: + pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", + smp_processor_id(), watchdog->name, wd_seq_delay); + pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", + cs->name, wd_delay); + return WD_READ_SKIP; } static u64 csnow_mid; @@ -257,7 +285,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (cpu == smp_processor_id()) cpu = cpumask_next(cpu, cpu_online_mask); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) @@ -279,7 +307,7 @@ static void clocksource_verify_choose_cpus(void) cpu = prandom_u32() % nr_cpu_ids; cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } @@ -356,6 +384,7 @@ static void clocksource_watchdog(struct timer_list *unused) int next_cpu, reset_pending; int64_t wd_nsec, cs_nsec; struct clocksource *cs; + enum wd_read_status read_ret; u32 md; spin_lock(&watchdog_lock); @@ -373,9 +402,12 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - if (!cs_watchdog_read(cs, &csnow, &wdnow)) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); + read_ret = cs_watchdog_read(cs, &csnow, &wdnow); + + if (read_ret != WD_READ_SUCCESS) { + if (read_ret == WD_READ_UNSTABLE) + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); continue; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6bffe5af8cb1..17a283ce2b20 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1375,6 +1375,13 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); + /* + * If all CPUs are idle. We may need to update a stale jiffies value. + * Note nohz_full is a special case: a timekeeper is guaranteed to stay + * alive but it might be busy looping with interrupts disabled in some + * rare case (typically stop machine). So we must make sure we have a + * last resort. + */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b348749a9fc6..dcdcb85121e4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1306,8 +1306,7 @@ int do_settimeofday64(const struct timespec64 *ts) timekeeping_forward_now(tk); xt = tk_xtime(tk); - ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { ret = -EINVAL; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e3d2c23c413d..85f1021ad459 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2054,26 +2054,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/torture.c b/kernel/torture.c index bb8f411c974b..ef27a6c82451 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -570,7 +570,7 @@ int torture_shuffle_init(long shuffint) shuffle_idle_cpu = -1; if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + TOROUT_ERRSTRING("Failed to alloc mask"); return -ENOMEM; } @@ -934,7 +934,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, *tp = kthread_run(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); - VERBOSE_TOROUT_ERRSTRING(f); + TOROUT_ERRSTRING(f); *tp = NULL; } torture_shuffle_task_register(*tp); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 420ff4bc67fd..a5eb5e7fd624 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,19 @@ config HAVE_C_RECORDMCOUNT help C version of recordmcount available? +config HAVE_BUILDTIME_MCOUNT_SORT + bool + help + An architecture selects this if it sorts the mcount_loc section + at build time. + +config BUILDTIME_MCOUNT_SORT + bool + default y + depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE + help + Sort the mcount_loc section at build time. + config TRACER_MAX_TRACE bool @@ -915,6 +928,20 @@ config EVENT_TRACE_TEST_SYSCALLS TBD - enable a way to actually call the syscalls as we test their events +config FTRACE_SORT_STARTUP_TEST + bool "Verify compile time sorting of ftrace functions" + depends on DYNAMIC_FTRACE + depends on BUILDTIME_MCOUNT_SORT + help + Sorting of the mcount_loc sections that is used to find the + where the ftrace knows where to patch functions for tracing + and other callbacks is done at compile time. But if the sort + is not done correctly, it will cause non-deterministic failures. + When this is set, the sorted sections will be verified that they + are in deed sorted and will warn if they are not. + + If unsure, say N + config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1183c88634aa..af68a67179b4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -34,7 +34,7 @@ static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk) struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; - spin_lock_irqsave(&running_trace_lock, flags); + raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } - spin_unlock_irqrestore(&running_trace_lock, flags); + raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -666,9 +666,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -676,9 +676,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1045,7 +1045,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, } r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), @@ -1608,9 +1608,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ae9755037b7e..21aa30644219 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; @@ -450,9 +450,9 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { .func = bpf_trace_vprintk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -492,9 +492,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -509,7 +509,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -533,7 +533,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -694,7 +694,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -764,7 +764,7 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, - .ret_btf_id = &btf_task_struct_ids[0], + .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) @@ -779,7 +779,7 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, .gpl_only = true, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &btf_task_struct_ids[0], + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .ret_type = RET_PTR_TO_BTF_ID, .ret_btf_id = &bpf_task_pt_regs_ids[0], }; @@ -1004,7 +1004,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1012,7 +1012,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) { /* This helper call is inlined by verifier. */ - return ((u64 *)ctx)[-1]; + return ((u64 *)ctx)[-2]; } static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { @@ -1091,6 +1091,53 @@ static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) +{ + /* This helper call is inlined by verifier. */ + u64 nr_args = ((u64 *)ctx)[-1]; + + if ((u64) n >= nr_args) + return -EINVAL; + *value = ((u64 *)ctx)[n]; + return 0; +} + +static const struct bpf_func_proto bpf_get_func_arg_proto = { + .func = get_func_arg, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) +{ + /* This helper call is inlined by verifier. */ + u64 nr_args = ((u64 *)ctx)[-1]; + + *value = ((u64 *)ctx)[nr_args]; + return 0; +} + +static const struct bpf_func_proto bpf_get_func_ret_proto = { + .func = get_func_ret, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_1(get_func_arg_cnt, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { + .func = get_func_arg_cnt, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1206,6 +1253,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_func_ip_proto_tracing; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: @@ -1285,7 +1334,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1400,9 +1449,6 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); struct perf_branch_stack *br_stack = ctx->data->br_stack; u32 to_copy; @@ -1411,7 +1457,7 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, return -EINVAL; if (unlikely(!br_stack)) - return -EINVAL; + return -ENOENT; if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) return br_stack->nr * br_entry_size; @@ -1423,7 +1469,6 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, memcpy(buf, br_stack->entries, to_copy); return to_copy; -#endif } static const struct bpf_func_proto bpf_read_branch_records_proto = { @@ -1511,7 +1556,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1565,7 +1610,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1631,6 +1676,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; + case BPF_FUNC_get_func_arg: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + case BPF_FUNC_get_func_ret: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; + case BPF_FUNC_get_func_arg_cnt: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30bc880c3849..f9feb197b2da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5217,6 +5217,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; + struct ftrace_hash *hash; int ret = -ENODEV; mutex_lock(&direct_mutex); @@ -5225,7 +5226,8 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; - if (direct_functions->count == 1) + hash = direct_ops.func_hash->filter_hash; + if (hash->count == 1) unregister_ftrace_function(&direct_ops); ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); @@ -5540,6 +5542,10 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) err = unregister_ftrace_function(ops); remove_direct_functions_hash(hash, addr); mutex_unlock(&direct_mutex); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; return err; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); @@ -6388,6 +6394,27 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } +#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST +static void test_is_sorted(unsigned long *start, unsigned long count) +{ + int i; + + for (i = 1; i < count; i++) { + if (WARN(start[i - 1] > start[i], + "[%d] %pS at %lx is not sorted with %pS at %lx\n", i, + (void *)start[i - 1], start[i - 1], + (void *)start[i], start[i])) + break; + } + if (i == count) + pr_info("ftrace section at %px sorted properly\n", start); +} +#else +static void test_is_sorted(unsigned long *start, unsigned long count) +{ +} +#endif + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -6406,8 +6433,17 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - sort(start, count, sizeof(*start), - ftrace_cmp_ips, NULL); + /* + * Sorting mcount in vmlinux at build time depend on + * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in + * modules can not be sorted at build time. + */ + if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) { + sort(start, count, sizeof(*start), + ftrace_cmp_ips, NULL); + } else { + test_is_sorted(start, count); + } start_pg = ftrace_allocate_pages(count); if (!start_pg) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2699e9e562b1..05dfc7a12d3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5898,16 +5898,13 @@ static __init int test_ringbuffer(void) rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; - rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], - "rbtester/%d", cpu); + rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], + cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } - - kthread_bind(rb_threads[cpu], cpu); - wake_up_process(rb_threads[cpu]); } /* Now create the rb hammer! */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88de94da596b..c860f582b078 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -980,6 +980,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer, event); } @@ -2601,6 +2603,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; + if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) + trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; @@ -2745,8 +2749,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && - (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && - (entry = this_cpu_read(trace_buffered_event))) { + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { + preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, @@ -2764,33 +2768,38 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. */ - int max_len = PAGE_SIZE - struct_size(entry, array, 1); + if ((entry = __this_cpu_read(trace_buffered_event))) { + int max_len = PAGE_SIZE - struct_size(entry, array, 1); - val = this_cpu_inc_return(trace_buffered_event_cnt); + val = this_cpu_inc_return(trace_buffered_event_cnt); - /* - * Preemption is disabled, but interrupts and NMIs - * can still come in now. If that happens after - * the above increment, then it will have to go - * back to the old method of allocating the event - * on the ring buffer, and if the filter fails, it - * will have to call ring_buffer_discard_commit() - * to remove it. - * - * Need to also check the unlikely case that the - * length is bigger than the temp buffer size. - * If that happens, then the reserve is pretty much - * guaranteed to fail, as the ring buffer currently - * only allows events less than a page. But that may - * change in the future, so let the ring buffer reserve - * handle the failure in that case. - */ - if (val == 1 && likely(len <= max_len)) { - trace_event_setup(entry, type, trace_ctx); - entry->array[0] = len; - return entry; + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ + if (val == 1 && likely(len <= max_len)) { + trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + /* Return with preemption disabled */ + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); } - this_cpu_dec(trace_buffered_event_cnt); + /* __trace_buffer_lock_reserve() disables preemption */ + preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, @@ -3207,7 +3216,7 @@ struct trace_buffer_struct { char buffer[4][TRACE_BUF_SIZE]; }; -static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * This allows for lockless recording. If we're nested too deeply, then @@ -3217,7 +3226,7 @@ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!buffer || buffer->nesting >= 4) + if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; @@ -3236,7 +3245,7 @@ static void put_trace_buf(void) static int alloc_percpu_trace_buffer(void) { - struct trace_buffer_struct *buffers; + struct trace_buffer_struct __percpu *buffers; if (trace_percpu_buffer) return 0; @@ -4183,7 +4192,7 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" + "# / _-----=> irqs-off/BH-disabled\n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" @@ -4224,7 +4233,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); @@ -4834,6 +4843,12 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + stream_open(inode, filp); + return tracing_open_generic_tr(inode, filp); +} + static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -5635,7 +5650,7 @@ static const char readme_msg[] = "\t - a numeric literal: e.g. ms_per_sec=1000,\n" "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" "\n" - "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" "\t multiplication(*) and division(/) operators. An operand can be either a\n" "\t variable reference, field or numeric literal.\n" "\n" @@ -6718,10 +6733,9 @@ waitagain: cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset_startat(iter, 0, seq); + trace_iterator_reset(iter); cpumask_clear(iter->started); trace_seq_init(&iter->seq); - iter->pos = -1; trace_event_read_lock(); trace_access_lock(iter->cpu_file); @@ -7110,9 +7124,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tt) event_triggers_post_call(tr->trace_marker_file, tt); - if (written > 0) - *fpos += written; - return written; } @@ -7171,9 +7182,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - if (written > 0) - *fpos += written; - return written; } @@ -7573,16 +7581,14 @@ static const struct file_operations tracing_free_buffer_fops = { }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_mark_raw_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_raw_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; @@ -7734,7 +7740,8 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) err = ERR_PTR(-ENOMEM); - tr->n_err_log_entries++; + else + tr->n_err_log_entries++; return err; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b60ab9475ed..d038ddbf1bea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -83,6 +83,9 @@ enum trace_type { #undef __dynamic_array #define __dynamic_array(type, item) type item[]; +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item) type item[]; + #undef F_STRUCT #define F_STRUCT(args...) args @@ -1334,10 +1337,12 @@ __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { - /* Simply release the temp buffer */ + /* Simply release the temp buffer and enable preemption */ this_cpu_dec(trace_buffered_event_cnt); + preempt_enable_notrace(); return; } + /* ring_buffer_discard_commit() enables preemption */ ring_buffer_discard_commit(buffer, event); } @@ -1366,14 +1371,26 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, buffer, entry, event); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || - (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, entry))) { - __trace_event_discard_commit(buffer, event); - return true; - } + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard; return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; } /** @@ -1453,6 +1470,7 @@ struct filter_pred { static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING || field->filter_type == FILTER_STATIC_STRING || field->filter_type == FILTER_PTR_STRING || field->filter_type == FILTER_COMM; @@ -1560,15 +1578,13 @@ extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); -extern int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +extern int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); extern void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); @@ -1594,6 +1610,30 @@ get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); +extern bool event_trigger_check_remove(const char *glob); +extern bool event_trigger_empty_param(const char *param); +extern int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required); +extern struct event_trigger_data * +event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data); +extern int event_trigger_parse_num(char *trigger, + struct event_trigger_data *trigger_data); +extern int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data); +extern void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data); +extern int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *trigger, + struct event_trigger_data *trigger_data, + int *n_registered); /** * struct event_trigger_ops - callbacks for trace event triggers @@ -1601,10 +1641,20 @@ extern int register_trigger_hist_enable_disable_cmds(void); * The methods in this structure provide per-event trigger hooks for * various trigger operations. * + * The @init and @free methods are used during trigger setup and + * teardown, typically called from an event_command's @parse() + * function implementation. + * + * The @print method is used to print the trigger spec. + * + * The @trigger method is the function that actually implements the + * trigger and is called in the context of the triggering event + * whenever that event occurs. + * * All the methods below, except for @init() and @free(), must be * implemented. * - * @func: The trigger 'probe' function called when the triggering + * @trigger: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that * registered the trigger (see struct event_command) along with @@ -1633,9 +1683,10 @@ extern int register_trigger_hist_enable_disable_cmds(void); * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *rbe); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, @@ -1684,7 +1735,7 @@ struct event_trigger_ops { * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. * - * @func: The callback function responsible for parsing and + * @parse: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the * user. It allocates the trigger instance and registers it with * the appropriate trace event. It makes use of the other @@ -1719,21 +1770,24 @@ struct event_trigger_ops { * * @get_trigger_ops: The callback function invoked to retrieve the * event_trigger_ops implementation associated with the command. + * This callback function allows a single event_command to + * support multiple trigger implementations via different sets of + * event_trigger_ops, depending on the value of the @param + * string. */ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; int flags; - int (*func)(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *params); + int (*parse)(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); int (*reg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg_all)(struct trace_event_file *file); @@ -1914,14 +1968,7 @@ extern struct trace_iterator *tracepoint_print_iter; */ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) { - const size_t offset = offsetof(struct trace_iterator, seq); - - /* - * Keep gcc from complaining about overwriting more than just one - * member in the structure. - */ - memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); - + memset_startat(iter, 0, seq); iter->pos = -1; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 928867f527e7..191db32dec46 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -489,18 +489,12 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) if (trace_trigger_soft_disabled(edata->file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = edata->file; - dsize = get_eprobe_size(&edata->ep->tp, rec); - fbuffer.regs = NULL; - - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, - call->event.type, - sizeof(*entry) + edata->ep->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, edata->file, + sizeof(*entry) + edata->ep->tp.size + dsize); + + if (!entry) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); @@ -549,29 +543,29 @@ static void eprobe_trigger_func(struct event_trigger_data *data, } static struct event_trigger_ops eprobe_trigger_ops = { - .func = eprobe_trigger_func, + .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, .free = eprobe_trigger_free, }; -static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { return -1; } -static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static int eprobe_trigger_reg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { return -1; } -static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static void eprobe_trigger_unreg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { } @@ -586,7 +580,7 @@ static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = eprobe_trigger_cmd_func, + .parse = eprobe_trigger_cmd_parse, .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4021b9a79f93..3147614c1812 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2678,12 +2678,24 @@ static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; struct trace_event_file *file; + unsigned int first; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL; + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + if (!trace_pid_list_first(pid_list, &first) || + !trace_pid_list_first(no_pid_list, &first)) + file->flags |= EVENT_FILE_FL_PID_FILTER; + file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); @@ -3449,10 +3461,8 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) { - pr_warn("Could not create tracefs 'enable' entry\n"); + if (!entry) return -ENOMEM; - } /* There are not as crucial, just warn if they are not created */ @@ -3468,17 +3478,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, + trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, + trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9124038b140..b458a9afa2c0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -5,6 +5,7 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ +#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -654,6 +655,52 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); +/* user space strings temp buffer */ +#define USTRING_BUF_SIZE 1024 + +struct ustring_buffer { + char buffer[USTRING_BUF_SIZE]; +}; + +static __percpu struct ustring_buffer *ustring_per_cpu; + +static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + return kstr; +} + +static __always_inline char *test_ustring(char *str) +{ + struct ustring_buffer *ubuf; + char __user *ustr; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + + return kstr; +} + /* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event) { @@ -667,19 +714,43 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -/* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event) +static __always_inline int filter_pchar(struct filter_pred *pred, char *str) { - char **addr = (char **)(event + pred->offset); int cmp, match; - int len = strlen(*addr) + 1; /* including tailing '\0' */ + int len; - cmp = pred->regex.match(*addr, &pred->regex, len); + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); match = cmp ^ pred->not; return match; } +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_string(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} + +/* Filter predicate for char * pointers in user space*/ +static int filter_pred_pchar_user(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_ustring(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} /* * Filter predicate for dynamic sized arrays of characters. @@ -706,6 +777,29 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* + * Filter predicate for relative dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry as same as dynamic string. + * The difference is that the relative one records the location offset + * from the field itself, not the event entry. + */ +static int filter_pred_strrelloc(struct filter_pred *pred, void *event) +{ + u32 *item = (u32 *)(event + pred->offset); + u32 str_item = *item; + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(&item[1]) + str_loc; + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + /* Filter predicate for CPUs. */ static int filter_pred_cpu(struct filter_pred *pred, void *event) { @@ -756,7 +850,7 @@ static int filter_pred_none(struct filter_pred *pred, void *event) * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING, unless @len is zero. + * RDYN_STRING, or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) @@ -1083,6 +1177,9 @@ int filter_assign_type(const char *type) if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; + if (strstr(type, "__rel_loc") && strstr(type, "char")) + return FILTER_RDYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; @@ -1158,6 +1255,7 @@ static int parse_pred(const char *str, void *data, struct filter_pred *pred = NULL; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + bool ustring = false; char q; u64 val; int len; @@ -1192,6 +1290,12 @@ static int parse_pred(const char *str, void *data, return -EINVAL; } + /* See if the field is a user space string */ + if ((len = str_has_prefix(str + i, ".ustring"))) { + ustring = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1318,10 +1422,24 @@ static int parse_pred(const char *str, void *data, pred->fn = filter_pred_string; pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn = filter_pred_strloc; - else - pred->fn = filter_pred_pchar; + } else if (field->filter_type == FILTER_RDYN_STRING) + pred->fn = filter_pred_strrelloc; + else { + + if (!ustring_per_cpu) { + /* Once allocated, keep it around for good */ + ustring_per_cpu = alloc_percpu(struct ustring_buffer); + if (!ustring_per_cpu) + goto err_mem; + } + + if (ustring) + pred->fn = filter_pred_pchar_user; + else + pred->fn = filter_pred_pchar; + } /* go past the last quote */ i++; @@ -1387,6 +1505,9 @@ static int parse_pred(const char *str, void *data, err_free: kfree(pred); return -EINVAL; +err_mem: + kfree(pred); + return -ENOMEM; } enum { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9555b8e1d1e3..ada87bfb5bb8 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -217,6 +217,20 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, return (u64)(unsigned long)addr; } +static u64 hist_field_reldynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 *item = event + hist_field->field->offset; + u32 str_item = *item; + int str_loc = str_item & 0xffff; + char *addr = (char *)&item[1] + str_loc; + + return (u64)(unsigned long)addr; +} + static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -1956,8 +1970,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (field->filter_type == FILTER_STATIC_STRING) { hist_field->fn = hist_field_string; hist_field->size = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { hist_field->fn = hist_field_dynstring; + } else if (field->filter_type == FILTER_RDYN_STRING) + hist_field->fn = hist_field_reldynstring; else hist_field->fn = hist_field_pstring; } else { @@ -2487,6 +2503,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); expr->fn = hist_field_unary_minus; expr->operands[0] = operand1; + expr->size = operand1->size; + expr->is_signed = operand1->is_signed; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); expr->type = kstrdup_const(operand1->type, GFP_KERNEL); @@ -2703,6 +2721,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; + expr->is_signed = operand1->is_signed; expr->operator = field_op; expr->type = kstrdup_const(operand1->type, GFP_KERNEL); @@ -2745,9 +2764,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, } static struct event_command trigger_hist_cmd; -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -2950,8 +2969,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, var_hist->hist_data = hist_data; /* Create the new histogram with our variable */ - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "", "hist", cmd); if (ret) { kfree(cmd); kfree(var_hist->cmd); @@ -3757,7 +3776,7 @@ static int check_synth_field(struct synth_event *event, if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || - field->is_signed != hist_field->is_signed) + (!field->is_string && field->is_signed != hist_field->is_signed)) return -EINVAL; } @@ -3919,6 +3938,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, var_ref_idx = find_var_ref_idx(hist_data, var_ref); if (WARN_ON(var_ref_idx < 0)) { + kfree(p); ret = var_ref_idx; goto err; } @@ -4961,7 +4981,8 @@ static inline void add_to_key(char *compound_key, void *key, struct ftrace_event_field *field; field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING) + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) size = *(u32 *)(rec + field->offset) >> 16; else if (field->filter_type == FILTER_STATIC_STRING) size = field->size; @@ -5712,8 +5733,8 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_field_var_hists; i++) { file = hist_data->field_var_hists[i]->hist_data->event_file; cmd = hist_data->field_var_hists[i]->cmd; - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "!hist", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "!hist", "hist", cmd); WARN_ON_ONCE(ret < 0); } } @@ -5742,7 +5763,7 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, .free = event_hist_trigger_free, @@ -5776,7 +5797,7 @@ static void event_hist_trigger_named_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_named_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, @@ -5893,7 +5914,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } -static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, +static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6045,7 +6066,7 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, return false; } -static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6129,9 +6150,9 @@ static void hist_unreg_all(struct trace_event_file *file) } } -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; @@ -6146,7 +6167,9 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); - if (glob && strlen(glob)) { + WARN_ON(!glob); + + if (strlen(glob)) { hist_err_clear(); last_cmd_set(file, param); } @@ -6179,7 +6202,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, continue; } break; - } while (p); + } while (1); if (!p) param = NULL; @@ -6245,7 +6268,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6254,7 +6277,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of triggers registered, * but if it didn't register any it returns zero. Consider no @@ -6297,7 +6320,7 @@ enable: return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); @@ -6314,7 +6337,7 @@ static struct event_command trigger_hist_cmd = { .name = "hist", .trigger_type = ETT_EVENT_HIST, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = event_hist_trigger_func, + .parse = event_hist_trigger_parse, .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, @@ -6366,28 +6389,28 @@ hist_enable_count_trigger(struct event_trigger_data *data, } static struct event_trigger_ops hist_enable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_enable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, @@ -6429,7 +6452,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, @@ -6440,7 +6463,7 @@ static struct event_command trigger_hist_enable_cmd = { static struct event_command trigger_hist_disable_cmd = { .name = DISABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index c188045c5f97..d6b4935a78c0 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -168,10 +168,14 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) continue; if (field->filter_type == FILTER_STATIC_STRING) continue; - if (field->filter_type == FILTER_DYN_STRING) { + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { u32 *str_item; int str_loc = entry_size & 0xffff; + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; + str_item = (u32 *)(entry + field->offset); *str_item = str_loc; /* string length is 0. */ } else { @@ -214,7 +218,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) if (field->filter_type == FILTER_STATIC_STRING) { strlcpy(entry + field->offset, addr, field->size); - } else if (field->filter_type == FILTER_DYN_STRING) { + } else if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; int str_loc = entry_size & 0xffff; u32 *str_item; @@ -229,6 +234,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) strlcpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; *str_item = (str_len << 16) | str_loc; } else { char **paddr; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 22db3ce95e74..154db74dadbc 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1237,9 +1237,8 @@ static int __create_synth_event(const char *name, const char *raw_fields) argv + consumed, &consumed, &field_version); if (IS_ERR(field)) { - argv_free(argv); ret = PTR_ERR(field); - goto err; + goto err_free_arg; } /* @@ -1262,18 +1261,19 @@ static int __create_synth_event(const char *name, const char *raw_fields) if (cmd_version > 1 && n_fields_this_loop >= 1) { synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str)); ret = -EINVAL; - goto err; + goto err_free_arg; } fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; - goto err; + goto err_free_arg; } n_fields_this_loop++; } + argv_free(argv); if (consumed < argc) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -1281,7 +1281,6 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err; } - argv_free(argv); } if (n_fields == 0) { @@ -1307,6 +1306,8 @@ static int __create_synth_event(const char *name, const char *raw_fields) kfree(saved_fields); return ret; + err_free_arg: + argv_free(argv); err: for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); @@ -1978,7 +1979,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_next_val); /** * synth_event_add_val - Add a named field's value to an open synth trace * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to + * @val: The value to set the named field to * @trace_state: A pointer to object tracking the piecewise trace state * * Set the value of the named field in an event that's been opened by @@ -2053,6 +2054,13 @@ static int create_synth_event(const char *raw_command) last_cmd_set(raw_command); + name = raw_command; + + /* Don't try to process if not our system */ + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + p = strpbrk(raw_command, " \t"); if (!p) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -2061,12 +2069,6 @@ static int create_synth_event(const char *raw_command) fields = skip_spaces(p); - name = raw_command; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - /* This interface accepts group name prefix */ if (strchr(name, '/')) { len = str_has_prefix(name, SYNTH_SYSTEM "/"); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 3d5c07239a2a..d00fee705f9c 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -68,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -78,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); } return tt; } @@ -106,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL, NULL); + data->ops->trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -245,7 +245,7 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) mutex_lock(&trigger_cmd_mutex); list_for_each_entry(p, &trigger_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(p, file, buff, command, next); + ret = p->parse(p, file, buff, command, next); goto out_unlock; } } @@ -540,7 +540,6 @@ void update_cond_flag(struct trace_event_file *file) /** * register_trigger - Generic event_command @reg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data to associate with the trigger * @file: The trace_event_file associated with the event * @@ -551,7 +550,7 @@ void update_cond_flag(struct trace_event_file *file) * * Return: 0 on success, errno otherwise */ -static int register_trigger(char *glob, struct event_trigger_ops *ops, +static int register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -589,7 +588,6 @@ out: /** * unregister_trigger - Generic event_command @unreg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @test: Trigger-specific data used to find the trigger to remove * @file: The trace_event_file associated with the event * @@ -598,7 +596,7 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { @@ -621,8 +619,350 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, data->ops->free(data->ops, data); } +/* + * Event trigger parsing helper functions. + * + * These functions help make it easier to write an event trigger + * parsing function i.e. the struct event_command.parse() callback + * function responsible for parsing and registering a trigger command + * written to the 'trigger' file. + * + * A trigger command (or just 'trigger' for short) takes the form: + * [trigger] [if filter] + * + * The struct event_command.parse() callback (and other struct + * event_command functions) refer to several components of a trigger + * command. Those same components are referenced by the event trigger + * parsing helper functions defined below. These components are: + * + * cmd - the trigger command name + * glob - the trigger command name optionally prefaced with '!' + * param_and_filter - text following cmd and ':' + * param - text following cmd and ':' and stripped of filter + * filter - the optional filter text following (and including) 'if' + * + * To illustrate the use of these componenents, here are some concrete + * examples. For the following triggers: + * + * echo 'traceon:5 if pid == 0' > trigger + * - 'traceon' is both cmd and glob + * - '5 if pid == 0' is the param_and_filter + * - '5' is the param + * - 'if pid == 0' is the filter + * + * echo 'enable_event:sys:event:n' > trigger + * - 'enable_event' is both cmd and glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'hist:keys=pid if prio > 50' > trigger + * - 'hist' is both cmd and glob + * - 'keys=pid if prio > 50' is the param_and_filter + * - 'keys=pid' is the param + * - 'if prio > 50' is the filter + * + * echo '!enable_event:sys:event:n' > trigger + * - 'enable_event' the cmd + * - '!enable_event' is the glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'traceoff' > trigger + * - 'traceoff' is both cmd and glob + * - there is no param_and_filter + * - there is no param + * - there is no filter + * + * There are a few different categories of event trigger covered by + * these helpers: + * + * - triggers that don't require a parameter e.g. traceon + * - triggers that do require a parameter e.g. enable_event and hist + * - triggers that though they may not require a param may support an + * optional 'n' param (n = number of times the trigger should fire) + * e.g.: traceon:5 or enable_event:sys:event:n + * - triggers that do not support an 'n' param e.g. hist + * + * These functions can be used or ignored as necessary - it all + * depends on the complexity of the trigger, and the granularity of + * the functions supported reflects the fact that some implementations + * may need to customize certain aspects of their implementations and + * won't need certain functions. For instance, the hist trigger + * implementation doesn't use event_trigger_separate_filter() because + * it has special requirements for handling the filter. + */ + +/** + * event_trigger_check_remove - check whether an event trigger specifies remove + * @glob: The trigger command string, with optional remove(!) operator + * + * The event trigger callback implementations pass in 'glob' as a + * parameter. This is the command name either with or without a + * remove(!) operator. This function simply parses the glob and + * determines whether the command corresponds to a trigger removal or + * a trigger addition. + * + * Return: true if this is a remove command, false otherwise + */ +bool event_trigger_check_remove(const char *glob) +{ + return (glob && glob[0] == '!') ? true : false; +} + +/** + * event_trigger_empty_param - check whether the param is empty + * @param: The trigger param string + * + * The event trigger callback implementations pass in 'param' as a + * parameter. This corresponds to the string following the command + * name minus the command name. This function can be called by a + * callback implementation for any command that requires a param; a + * callback that doesn't require a param can ignore it. + * + * Return: true if this is an empty param, false otherwise + */ +bool event_trigger_empty_param(const char *param) +{ + return !param; +} + +/** + * event_trigger_separate_filter - separate an event trigger from a filter + * @param: The param string containing trigger and possibly filter + * @trigger: outparam, will be filled with a pointer to the trigger + * @filter: outparam, will be filled with a pointer to the filter + * @param_required: Specifies whether or not the param string is required + * + * Given a param string of the form '[trigger] [if filter]', this + * function separates the filter from the trigger and returns the + * trigger in *trigger and the filter in *filter. Either the *trigger + * or the *filter may be set to NULL by this function - if not set to + * NULL, they will contain strings corresponding to the trigger and + * filter. + * + * There are two cases that need to be handled with respect to the + * passed-in param: either the param is required, or it is not + * required. If @param_required is set, and there's no param, it will + * return -EINVAL. If @param_required is not set and there's a param + * that starts with a number, that corresponds to the case of a + * trigger with :n (n = number of times the trigger should fire) and + * the parsing continues normally; otherwise the function just returns + * and assumes param just contains a filter and there's nothing else + * to do. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required) +{ + int ret = 0; + + *param = *filter = NULL; + + if (!param_and_filter) { + if (param_required) + ret = -EINVAL; + goto out; + } + + /* + * Here we check for an optional param. The only legal + * optional param is :n, and if that's the case, continue + * below. Otherwise we assume what's left is a filter and + * return it as the filter string for the caller to deal with. + */ + if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { + *filter = param_and_filter; + goto out; + } + + /* + * Separate the param from the filter (param [if filter]). + * Here we have either an optional :n param or a required + * param and an optional filter. + */ + *param = strsep(¶m_and_filter, " \t"); + + /* + * Here we have a filter, though it may be empty. + */ + if (param_and_filter) { + *filter = skip_spaces(param_and_filter); + if (!**filter) + *filter = NULL; + } +out: + return ret; +} + +/** + * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * @cmd_ops: The event_command operations for the trigger + * @cmd: The cmd string + * @param: The param string + * @private_data: User data to associate with the event trigger + * + * Allocate an event_trigger_data instance and initialize it. The + * @cmd_ops are used along with the @cmd and @param to get the + * trigger_ops to assign to the event_trigger_data. @private_data can + * also be passed in and associated with the event_trigger_data. + * + * Use event_trigger_free() to free an event_trigger_data object. + * + * Return: The trigger_data object success, NULL otherwise + */ +struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, param); + + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + return NULL; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = private_data; + + INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + return trigger_data; +} + +/** + * event_trigger_parse_num - parse and return the number param for a trigger + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * + * Parse the :n (n = number of times the trigger should fire) param + * and set the count variable in the trigger_data to the parsed count. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_parse_num(char *param, + struct event_trigger_data *trigger_data) +{ + char *number; + int ret = 0; + + if (param) { + number = strsep(¶m, ":"); + + if (!strlen(number)) + return -EINVAL; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + } + + return ret; +} + +/** + * event_trigger_set_filter - set an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @param: The string containing the filter + * @trigger_data: The trigger_data for the trigger + * + * Set the filter for the trigger. If the filter is NULL, just return + * without error. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data) +{ + if (param && cmd_ops->set_filter) + return cmd_ops->set_filter(param, trigger_data, file); + + return 0; +} + +/** + * event_trigger_reset_filter - reset an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @trigger_data: The trigger_data for the trigger + * + * Reset the filter for the trigger to no filter. + */ +void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data) +{ + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); +} + +/** + * event_trigger_register - register an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @cmd: The cmd string + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * @n_registered: optional outparam, the number of triggers registered + * + * Register an event trigger. The @cmd_ops are used to call the + * cmd_ops->reg() function which actually does the registration. The + * cmd_ops->reg() function returns the number of triggers registered, + * which is assigned to n_registered, if n_registered is non-NULL. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *param, + struct event_trigger_data *trigger_data, + int *n_registered) +{ + int ret; + + if (n_registered) + *n_registered = 0; + + ret = cmd_ops->reg(glob, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + cmd_ops->unreg(glob, trigger_data, file); + ret = -ENOENT; + } else if (ret > 0) { + if (n_registered) + *n_registered = ret; + /* Just return zero, not the number of enabled functions */ + ret = 0; + } + + return ret; +} + +/* + * End event trigger parsing helper functions. + */ + /** - * event_trigger_callback - Generic event_command @func implementation + * event_trigger_parse - Generic event_command @parse implementation * @cmd_ops: The command ops, used for trigger registration * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger @@ -632,15 +972,15 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, * Common implementation for event command parsing and trigger * instantiation. * - * Usually used directly as the @func method in event command + * Usually used directly as the @parse method in event command * implementations. * * Return: 0 on success, errno otherwise */ static int -event_trigger_callback(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +event_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -673,7 +1013,7 @@ event_trigger_callback(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); ret = 0; goto out; @@ -708,14 +1048,14 @@ event_trigger_callback(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { - cmd_ops->unreg(glob, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob, trigger_data, file); ret = -ENOENT; } else if (ret > 0) ret = 0; @@ -1023,28 +1363,28 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops traceon_trigger_ops = { - .func = traceon_trigger, + .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceon_count_trigger_ops = { - .func = traceon_count_trigger, + .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_trigger_ops = { - .func = traceoff_trigger, + .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_count_trigger_ops = { - .func = traceoff_count_trigger, + .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1069,7 +1409,7 @@ onoff_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1080,7 +1420,7 @@ static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1116,14 +1456,14 @@ snapshot_count_trigger(struct event_trigger_data *data, } static int -register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, +register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { if (tracing_alloc_snapshot_instance(file->tr) != 0) return 0; - return register_trigger(glob, ops, data, file); + return register_trigger(glob, data, file); } static int @@ -1135,14 +1475,14 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops snapshot_trigger_ops = { - .func = snapshot_trigger, + .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops snapshot_count_trigger_ops = { - .func = snapshot_count_trigger, + .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1157,7 +1497,7 @@ snapshot_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_trigger, .get_trigger_ops = snapshot_get_trigger_ops, @@ -1226,14 +1566,14 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops stacktrace_trigger_ops = { - .func = stacktrace_trigger, + .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops stacktrace_count_trigger_ops = { - .func = stacktrace_count_trigger, + .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1249,7 +1589,7 @@ static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = stacktrace_get_trigger_ops, @@ -1353,36 +1693,36 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_enable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_enable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1455,7 +1795,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, trigger_data->private_data = enable_data; if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1502,7 +1842,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. @@ -1532,7 +1872,6 @@ int event_enable_trigger_func(struct event_command *cmd_ops, } int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { @@ -1574,7 +1913,6 @@ out: } void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file) { @@ -1628,7 +1966,7 @@ event_enable_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_enable_cmd = { .name = ENABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, @@ -1638,7 +1976,7 @@ static struct event_command trigger_enable_cmd = { static struct event_command trigger_disable_cmd = { .name = DISABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 56bb7b890578..d440ddd5fd8b 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -491,18 +491,14 @@ static void stop_per_cpu_kthreads(void) static int start_cpu_kthread(unsigned int cpu) { struct task_struct *kthread; - char comm[24]; - snprintf(comm, 24, "hwlatd/%d", cpu); - - kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm); + kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 33272a7b6912..508f14af4f2c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include <linux/bpf-cgroup.h> #include <linux/security.h> #include <linux/module.h> #include <linux/uaccess.h> @@ -327,11 +328,9 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) static void __disable_trace_kprobe(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_kprobe *tk; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (!trace_kprobe_is_registered(tk)) continue; if (trace_kprobe_is_return(tk)) @@ -348,7 +347,7 @@ static void __disable_trace_kprobe(struct trace_probe *tp) static int enable_trace_kprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_kprobe *tk; bool enabled; int ret = 0; @@ -369,8 +368,7 @@ static int enable_trace_kprobe(struct trace_event_call *call, if (enabled) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (trace_kprobe_has_gone(tk)) continue; ret = __enable_trace_kprobe(tk); @@ -559,11 +557,9 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, struct trace_kprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (strcmp(trace_kprobe_symbol(orig), trace_kprobe_symbol(comp)) || trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) @@ -1175,15 +1171,18 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_kprobe *tk; + unsigned long nmissed; if (!is_trace_kprobe(ev)) return 0; tk = to_trace_kprobe(ev); + nmissed = trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), - tk->rp.kp.nmissed); + nmissed); return 0; } @@ -1383,17 +1382,11 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; @@ -1430,16 +1423,11 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed55..870a08da5b48 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -138,8 +138,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - synchronize_rcu(); - kfree(inst); + kvfree_rcu(inst); } /* @@ -1701,7 +1700,7 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "osnoise/%d", cpu); } - kthread = kthread_create_on_cpu(main, NULL, cpu, comm); + kthread = kthread_run_on_cpu(main, NULL, cpu, comm); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); @@ -1710,7 +1709,6 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } @@ -2123,6 +2121,13 @@ out_unhook_irq: return -EINVAL; } +static void osnoise_unhook_events(void) +{ + unhook_thread_events(); + unhook_softirq_events(); + unhook_irq_events(); +} + /* * osnoise_workload_start - start the workload and hook to events */ @@ -2155,7 +2160,14 @@ static int osnoise_workload_start(void) retval = start_per_cpu_kthreads(); if (retval) { - unhook_irq_events(); + trace_osnoise_callback_enabled = false; + /* + * Make sure that ftrace_nmi_enter/exit() see + * trace_osnoise_callback_enabled as false before continuing. + */ + barrier(); + + osnoise_unhook_events(); return retval; } @@ -2186,9 +2198,7 @@ static void osnoise_workload_stop(void) stop_per_cpu_kthreads(); - unhook_irq_events(); - unhook_softirq_events(); - unhook_thread_events(); + osnoise_unhook_events(); } static void osnoise_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3547e7176ff7..8aa493d25c73 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -445,14 +445,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int bh_off; int nmi; nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + bh_off = entry->flags & TRACE_FLAG_BH_OFF; irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ed2a3f37297..73d90179b51b 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) return -EINVAL; } *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + if (!*pbuf) + return -ENOMEM; return 0; } @@ -1138,8 +1140,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - synchronize_rcu(); - kfree(link); + kvfree_rcu(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8bfcd3b09422..f755bde42fd0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -323,8 +323,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->enter_event->event.type, size, trace_ctx); if (!event) return; @@ -367,8 +366,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->exit_event->event.type, sizeof(*entry), trace_ctx); if (!event) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0a5c0db3137e..9711589273cd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_uprobe: " fmt +#include <linux/bpf-cgroup.h> #include <linux/security.h> #include <linux/ctype.h> #include <linux/module.h> @@ -409,12 +410,10 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, struct trace_uprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; struct inode *comp_inode = d_real_inode(comp->path.dentry); int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (comp_inode != d_real_inode(orig->path.dentry) || comp->offset != orig->offset) continue; @@ -949,8 +948,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; - struct trace_buffer *buffer; - struct ring_buffer_event *event; + struct trace_event_buffer fbuffer; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); @@ -965,12 +963,10 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, size, 0); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { entry->vaddr[0] = func; entry->vaddr[1] = instruction_pointer(regs); @@ -982,7 +978,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); + trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ @@ -1075,14 +1071,12 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_uprobe *tu; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { if (!tu->inode) continue; @@ -1094,7 +1088,7 @@ static void __probe_event_disable(struct trace_probe *tp) static int probe_event_enable(struct trace_event_call *call, struct trace_event_file *file, filter_func_t filter) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; bool enabled; int ret; @@ -1129,8 +1123,7 @@ static int probe_event_enable(struct trace_event_call *call, if (ret) goto err_flags; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = trace_uprobe_enable(tu, filter); if (ret) { __probe_event_disable(tp); @@ -1275,7 +1268,7 @@ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, static int uprobe_perf_close(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int ret = 0; @@ -1287,8 +1280,7 @@ static int uprobe_perf_close(struct trace_event_call *call, if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; @@ -1300,7 +1292,7 @@ static int uprobe_perf_close(struct trace_event_call *call, static int uprobe_perf_open(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int err = 0; @@ -1312,7 +1304,7 @@ static int uprobe_perf_open(struct trace_event_call *call, if (trace_uprobe_filter_add(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); @@ -1618,6 +1610,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); + if (!tu->filename) { + ret = -ENOMEM; + goto error; + } + init_trace_event_call(tu); ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 39bb56d2dcbe..9628b5571846 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -15,6 +15,7 @@ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/sort.h> +#include <linux/kmemleak.h> #include "tracing_map.h" #include "trace.h" @@ -307,6 +308,7 @@ static void tracing_map_array_free(struct tracing_map_array *a) for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; + kmemleak_free(a->pages[i]); free_page((unsigned long)a->pages[i]); } @@ -342,6 +344,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!a->pages[i]) goto free; + kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL); } out: return a; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f00de83d0246..1d261fbe367b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); stats->ac_btime64 = btime; - if (thread_group_leader(tsk)) { + if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } + if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) + stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) diff --git a/kernel/ucount.c b/kernel/ucount.c index 4f5613dac227..65b597431c86 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -190,6 +190,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } @@ -210,6 +211,7 @@ void put_ucounts(struct ucounts *ucounts) if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); kfree(ucounts); } } @@ -264,15 +266,16 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) { struct ucounts *iter; + long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long max = READ_ONCE(iter->ns->ucount_max[type]); long new = atomic_long_add_return(v, &iter->ucount[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; + max = READ_ONCE(iter->ns->ucount_max[type]); } return ret; } @@ -312,15 +315,16 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; + long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long max = READ_ONCE(iter->ns->ucount_max[type]); long new = atomic_long_add_return(1, &iter->ucount[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; + max = READ_ONCE(iter->ns->ucount_max[type]); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. @@ -339,15 +343,16 @@ unwind: return 0; } -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit) { struct ucounts *iter; - if (get_ucounts_value(ucounts, type) > max) - return true; + long max = rlimit; + if (rlimit > LONG_MAX) + max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - max = READ_ONCE(iter->ns->ucount_max[type]); if (get_ucounts_value(iter, type) > max) return true; + max = READ_ONCE(iter->ns->ucount_max[type]); } return false; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ad912511a0c0..99afb88d2e85 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -740,6 +740,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock(&watchdog_mutex); return err; } + +static const int sixty = 60; + +static struct ctl_table watchdog_sysctls[] = { + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif + {} +}; + +static void __init watchdog_sysctl_init(void) +{ + register_sysctl_init("kernel", watchdog_sysctls); +} +#else +#define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) @@ -753,4 +853,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; lockup_detector_setup(); + watchdog_sysctl_init(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 613917bbc4e7..33f1106b4f99 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,6 +154,9 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ + /* The current concurrency level. */ + atomic_t nr_running; + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -178,18 +181,11 @@ struct worker_pool { int refcnt; /* PL: refcnt for unbound pools */ /* - * The current concurrency level. As it's likely to be accessed - * from other CPUs during try_to_wake_up(), put it in a separate - * cacheline. - */ - atomic_t nr_running ____cacheline_aligned_in_smp; - - /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; -} ____cacheline_aligned_in_smp; +}; /* * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS @@ -868,8 +864,17 @@ void wq_worker_running(struct task_struct *task) if (!worker->sleeping) return; + + /* + * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check + * and the nr_running increment below, we may ruin the nr_running reset + * and leave with an unexpected pool->nr_running == 1 on the newly unbound + * pool. Protect against such race. + */ + preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); + preempt_enable(); worker->sleeping = 0; } @@ -878,8 +883,7 @@ void wq_worker_running(struct task_struct *task) * @task: task going to sleep * * This function is called from schedule() when a busy worker is - * going to sleep. Preemption needs to be disabled to protect ->sleeping - * assignment. + * going to sleep. */ void wq_worker_sleeping(struct task_struct *task) { @@ -904,6 +908,16 @@ void wq_worker_sleeping(struct task_struct *task) raw_spin_lock_irq(&pool->lock); /* + * Recheck in case unbind_workers() preempted us. We don't + * want to decrement nr_running after the worker is unbound + * and nr_running has been reset. + */ + if (worker->flags & WORKER_NOT_RUNNING) { + raw_spin_unlock_irq(&pool->lock); + return; + } + + /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). * Please read comment there. @@ -1531,7 +1545,8 @@ out: * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * can't go away. Callers that fail to ensure that the specified + * CPU cannot go away will execute on a randomly chosen CPU. * * Return: %false if @work was already on a queue, %true otherwise. */ @@ -1811,14 +1826,8 @@ static void worker_enter_idle(struct worker *worker) if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - /* - * Sanity check nr_running. Because unbind_workers() releases - * pool->lock between setting %WORKER_UNBOUND and zapping - * nr_running, the warning may trigger spuriously. Check iff - * unbind is not in progress. - */ - WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && - pool->nr_workers == pool->nr_idle && + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && atomic_read(&pool->nr_running)); } @@ -4979,38 +4988,22 @@ static void unbind_workers(int cpu) /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers - * except for the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they may become diasporas. + * must be on the cpu. After this, they may become diasporas. + * And the preemption disabled section in their sched callbacks + * are guaranteed to see WORKER_UNBOUND since the code here + * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; - raw_spin_unlock_irq(&pool->lock); - - for_each_pool_worker(worker, pool) { - kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); - } - - mutex_unlock(&wq_pool_attach_mutex); - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the %WORKER_UNBOUND flag. - * This is necessary as scheduler callbacks may be invoked - * from other cpus. - */ - schedule(); - - /* - * Sched callbacks are disabled now. Zap nr_running. - * After this, nr_running stays zero and need_more_worker() - * and keep_working() are always true as long as the - * worklist is not empty. This pool now behaves as an - * unbound (in terms of concurrency management) pool which + * The handling of nr_running in sched callbacks are disabled + * now. Zap nr_running. After this, nr_running stays zero and + * need_more_worker() and keep_working() are always true as + * long as the worklist is not empty. This pool now behaves as + * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ atomic_set(&pool->nr_running, 0); @@ -5020,9 +5013,16 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); + raw_spin_unlock_irq(&pool->lock); + + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, -1); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + } + + mutex_unlock(&wq_pool_attach_mutex); } } @@ -5059,17 +5059,6 @@ static void rebind_workers(struct worker_pool *pool) unsigned int worker_flags = worker->flags; /* - * A bound idle worker should actually be on the runqueue - * of the associated CPU for local wake-ups targeting it to - * work. Kick all idle workers so that they migrate to the - * associated CPU. Doing this in the same loop as - * replacing UNBOUND with REBOUND is safe as no worker will - * be bound before @pool->lock is released. - */ - if (worker_flags & WORKER_IDLE) - wake_up_process(worker->task); - - /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically * replace UNBOUND with another NOT_RUNNING flag REBOUND. |