aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--kernel/audit.c47
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/auditfilter.c7
-rw-r--r--kernel/auditsc.c11
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/bloom_filter.c6
-rw-r--r--kernel/bpf/bpf_inode_storage.c6
-rw-r--r--kernel/bpf/bpf_iter.c35
-rw-r--r--kernel/bpf/bpf_local_storage.c50
-rw-r--r--kernel/bpf/bpf_struct_ops.c6
-rw-r--r--kernel/bpf/bpf_task_storage.c10
-rw-r--r--kernel/bpf/btf.c541
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/core.c6
-rw-r--r--kernel/bpf/cpumap.c12
-rw-r--r--kernel/bpf/devmap.c36
-rw-r--r--kernel/bpf/helpers.c31
-rw-r--r--kernel/bpf/local_storage.c3
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/map_iter.c4
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/net_namespace.c1
-rw-r--r--kernel/bpf/reuseport_array.c6
-rw-r--r--kernel/bpf/ringbuf.c2
-rw-r--r--kernel/bpf/stackmap.c82
-rw-r--r--kernel/bpf/syscall.c7
-rw-r--r--kernel/bpf/task_iter.c82
-rw-r--r--kernel/bpf/trampoline.c8
-rw-r--r--kernel/bpf/verifier.c966
-rw-r--r--kernel/cgroup/cgroup-internal.h19
-rw-r--r--kernel/cgroup/cgroup-v1.c33
-rw-r--r--kernel/cgroup/cgroup.c95
-rw-r--r--kernel/cgroup/cpuset.c16
-rw-r--r--kernel/cgroup/rstat.c51
-rw-r--r--kernel/crash_core.c11
-rw-r--r--kernel/entry/common.c4
-rw-r--r--kernel/entry/kvm.c4
-rw-r--r--kernel/events/core.c4
-rw-r--r--kernel/irq/chip.c4
-rw-r--r--kernel/irq/handle.c11
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/kcsan/Makefile3
-rw-r--r--kernel/kcsan/core.c347
-rw-r--r--kernel/kcsan/kcsan_test.c426
-rw-r--r--kernel/kcsan/report.c51
-rw-r--r--kernel/kcsan/selftest.c143
-rw-r--r--kernel/locking/locktorture.c4
-rw-r--r--kernel/notifier.c15
-rw-r--r--kernel/power/power.h1
-rw-r--r--kernel/power/swap.c16
-rw-r--r--kernel/printk/printk.c104
-rw-r--r--kernel/rcu/Kconfig20
-rw-r--r--kernel/rcu/rcu_segcblist.c10
-rw-r--r--kernel/rcu/rcu_segcblist.h12
-rw-r--r--kernel/rcu/rcuscale.c14
-rw-r--r--kernel/rcu/rcutorture.c234
-rw-r--r--kernel/rcu/refscale.c50
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/tasks.h476
-rw-r--r--kernel/rcu/tree.c131
-rw-r--r--kernel/rcu/tree.h31
-rw-r--r--kernel/rcu/tree_exp.h14
-rw-r--r--kernel/rcu/tree_nocb.h160
-rw-r--r--kernel/rcu/tree_plugin.h250
-rw-r--r--kernel/rcu/tree_stall.h27
-rw-r--r--kernel/scftorture.c16
-rw-r--r--kernel/sched/Makefile7
-rw-r--r--kernel/sched/core.c86
-rw-r--r--kernel/sched/core_sched.c66
-rw-r--r--kernel/sched/cpuacct.c107
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/debug.c4
-rw-r--r--kernel/sched/fair.c87
-rw-r--r--kernel/sched/psi.c47
-rw-r--r--kernel/sched/rt.c23
-rw-r--r--kernel/sched/sched.h76
-rw-r--r--kernel/sched/stats.h5
-rw-r--r--kernel/signal.c9
-rw-r--r--kernel/sysctl.c1
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/torture.c4
-rw-r--r--kernel/trace/bpf_trace.c93
-rw-r--r--kernel/trace/trace.c6
-rw-r--r--kernel/trace/trace_kprobe.c1
-rw-r--r--kernel/trace/trace_uprobe.c1
-rw-r--r--kernel/ucount.c15
-rw-r--r--kernel/workqueue.c101
88 files changed, 3916 insertions, 1683 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 121d37e700a6..e4bbe2c70c26 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -718,7 +718,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
{
int rc = 0;
struct sk_buff *skb;
- static unsigned int failed = 0;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
@@ -735,32 +735,30 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
(*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -1446,7 +1444,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1459,7 +1457,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1542,6 +1540,20 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
/* Log information about who is connecting to the audit multicast socket */
@@ -1609,7 +1621,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1825,7 +1838,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
@@ -2132,7 +2147,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid_subj(current, &sid);
+ security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@@ -2353,7 +2368,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid_subj(current, &audit_sig_sid);
+ security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 72324afcffef..e7315d487163 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d75acb014ccd..42d99896e7a6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -637,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -1092,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
- sizeof(*data) + data->buflen);
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1368,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid_subj(current,
- &sid);
+ security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b517947bfa48..fce5d43a933f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -666,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid_subj(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use security_current_getsecid_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index cf6ca339f3cd..c1a9be6a4b9f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+ $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 277a05e9c984..b141a1346f72 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value)
return -EOPNOTSUPP;
}
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
@@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
.map_push_elem = bloom_map_push_elem,
.map_peek_elem = bloom_map_peek_elem,
.map_pop_elem = bloom_map_pop_elem,
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96ceed0e0fb5..e29d9e3d853e 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -17,6 +17,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
if (!bsb)
return NULL;
- inode_storage = rcu_dereference(bsb->storage);
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;
@@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b2ee45064e06..b7aef5b3416d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+/* maximum number of loops */
+#define MAX_LOOPS BIT(23)
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..71de2a89869c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -11,6 +11,9 @@
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
@@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
+void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ kfree_rcu(selem, rcu);
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
bool free_local_storage;
void *owner;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;
/* All uncharging on the owner must be done first.
@@ -118,12 +137,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
*
* Although the unlock will be done under
* rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(local_storage, rcu) is done
+ * read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
+ * to the caller which then calls then frees the storage after
+ * all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
@@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- kfree_rcu(selem, rcu);
-
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
return free_local_storage;
}
@@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
/* selem has already been unlinked from sk */
return;
- local_storage = rcu_dereference(selem->local_storage);
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
@@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
- kfree_rcu(local_storage, rcu);
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_rcu);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
/* selem has already be unlinked from smap */
return;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
@@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem;
/* Fast path (cache hit) */
- sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+ bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+ rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
@@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner,
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
+ * synchronize_rcu_mult (waiting for both sleepable and
+ * normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
@@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
- local_storage = rcu_dereference(*owner_storage(smap, owner));
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 8ecfe4752769..21069dbe9138 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -165,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
- if (btf_member_bitfield_size(t, member)) {
+ if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@@ -296,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
@@ -387,7 +387,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_prog *prog;
u32 moff;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ebfa8bc90892..5da7bed0f5f6 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,6 +17,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
- task_storage = rcu_dereference(task->bpf_storage);
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;
@@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
{
int ret;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
@@ -323,7 +327,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &btf_task_struct_ids[0],
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@@ -334,5 +338,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &btf_task_struct_ids[0],
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9bdb03767db5..33bb8ae4a804 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,7 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -282,6 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};
const char *btf_type_str(const struct btf_type *t)
@@ -418,6 +420,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
return true;
}
@@ -834,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show)
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
- const struct btf_type *t = show->state.type;
+ const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
@@ -1737,6 +1740,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -2345,6 +2349,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
+ const char *value;
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -2360,7 +2366,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- /* typedef type must have a valid name, and other ref types,
+ /* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -2369,6 +2375,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@@ -2958,7 +2970,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- offset = btf_member_bit_offset(t, member);
+ offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@@ -3083,7 +3095,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
- off = btf_member_bit_offset(t, member);
+ off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
@@ -3173,8 +3185,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
btf_show_start_member(show, member);
- member_offset = btf_member_bit_offset(t, member);
- bitfield_size = btf_member_bitfield_size(t, member);
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
@@ -4059,6 +4071,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = &datasec_ops,
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4460,8 +4473,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
log->len_total = log_size;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf) {
+ if (!bpf_verifier_log_attr_valid(log)) {
err = -EINVAL;
goto errout;
}
@@ -4814,7 +4826,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* t comes in already as a pointer */
t = btf_type_by_id(btf, t->type);
@@ -4823,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
t = btf_type_by_id(btf, t->type);
- /* char, signed char, unsigned char */
- return btf_type_is_int(t) && t->size == 1;
+ return btf_type_is_int(t);
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -4929,10 +4940,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
- if (ctx_arg_info->offset == off &&
- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
info->reg_type = ctx_arg_info->reg_type;
return true;
}
@@ -4945,7 +4958,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
- if (is_string_ptr(btf, t))
+ if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -5048,7 +5061,7 @@ again:
if (array_elem->nelems != 0)
goto error;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
@@ -5071,14 +5084,14 @@ error:
for_each_member(i, t, member) {
/* offset of the field in bytes */
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
- if (btf_member_bitfield_size(t, member)) {
- u32 end_bit = btf_member_bit_offset(t, member) +
- btf_member_bitfield_size(t, member);
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@@ -5563,12 +5576,53 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#endif
};
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ bpf_log(log, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_type_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
+ bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
@@ -5621,7 +5675,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (btf_is_kernel(btf)) {
+ if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+ if (check_ctx_reg(env, reg, regno))
+ return -EINVAL;
+ } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) {
const struct btf_type *reg_ref_t;
const struct btf *reg_btf;
const char *reg_ref_tname;
@@ -5637,14 +5704,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
- } else if (reg2btf_ids[reg->type]) {
+ } else {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[reg->type];
- } else {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname, regno);
- return -EINVAL;
}
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
@@ -5660,23 +5722,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname);
return -EINVAL;
}
- } else if (btf_get_prog_ctx_type(log, btf, t,
- env->prog->type, i)) {
- /* If function expects ctx type in BTF check that caller
- * is passing PTR_TO_CTX.
- */
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
- if (check_ctx_reg(env, reg, regno))
- return -EINVAL;
} else if (ptr_to_mem_ok) {
const struct btf_type *resolve_ret;
u32 type_size;
+ if (is_kfunc) {
+ /* Permit pointer to mem, but only when argument
+ * type is pointer to scalar, or struct composed
+ * (recursively) of scalars.
+ */
+ if (!btf_type_is_scalar(ref_t) &&
+ !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
+ i, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ }
+
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@@ -5689,6 +5752,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
+ bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
+ is_kfunc ? "kernel " : "", func_name, func_id);
return -EINVAL;
}
}
@@ -5738,7 +5803,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, false);
+ return btf_check_func_arg_match(env, btf, func_id, regs, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -5846,7 +5911,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
- reg->type = PTR_TO_MEM_OR_NULL;
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
reg->id = ++env->id_gen;
continue;
@@ -6157,6 +6222,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
return len;
}
+static void purge_cand_cache(struct btf *btf);
+
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
@@ -6191,6 +6258,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
goto out;
}
+ purge_cand_cache(NULL);
mutex_lock(&btf_module_mutex);
btf_mod->module = module;
btf_mod->btf = btf;
@@ -6233,6 +6301,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
btf_put(btf_mod->btf);
kfree(btf_mod->sysfs_attr);
kfree(btf_mod);
@@ -6336,13 +6405,16 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
.func = bpf_btf_find_by_name_kind,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
-BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
/* BTF ID set registration API for modules */
@@ -6391,3 +6463,384 @@ DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
#endif
+
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return -EOPNOTSUPP;
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+ kfree(cands->name);
+ kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in cache.
+ * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when new module is loaded purge all of module_cand_cache,
+ * since new module might have candidates with the name
+ * that matches cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
+ }
+ /* when module is unloaded purge cache entries
+ * that match module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
+ }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
+#endif
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
+
+ /* the resched point is before strncmp to make sure that search
+ * for non-existing name will have a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ if (IS_ERR(cands)) {
+ btf_put(mod_btf);
+ return ERR_CAST(cands);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or pointer to stack if cands->cnd == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_spec *specs;
+ int err;
+
+ /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ if (!specs)
+ return -ENOMEM;
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * copy of btf pointer into bpf_core_cand_list,
+ * since module can be unloaded while bpf_core_apply_relo_insn
+ * is working with module's btf.
+ */
+ }
+
+ err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
+ relo, relo_idx, ctx->btf, &cands, specs);
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43eb3501721b..514b4681a90a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1789,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2405e39d800f..de3e5bc6781f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1574,7 +1574,8 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@@ -1891,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
@@ -2300,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 585b2b77ccc4..b3e6b9422238 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
}
return;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, rcpu->prog, act);
@@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- struct xdp_frame *xdpf;
-
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f02d04540c0c..fe019dbdb3f0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
frames[nframes++] = xdpf;
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dev, xdp_prog, act);
@@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
bq->q[bq->count++] = xdpf;
}
-static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
- struct xdp_frame *xdpf;
int err;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ err = xdp_ok_fwd_dev(dev, xdpf->len);
if (unlikely(err))
return err;
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
@@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
__skb_push(skb, skb->mac_len);
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dst->dev, dst->xdp_prog, act);
@@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
return act;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
- return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}
-static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
if (!obj ||
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
return false;
return true;
@@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes)
return n;
}
-int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
- struct xdp_frame *xdpf;
int num_excluded = 0;
unsigned int i;
int err;
@@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!is_valid_dst(dst, xdp))
+ if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
@@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
lockdep_is_held(&dtab->index_lock)) {
- if (!is_valid_dst(dst, xdp))
+ if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 649f07623df6..01cfdf40c838 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -530,7 +531,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.func = bpf_strtol,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
@@ -558,13 +559,27 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
#endif
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+ return strncmp(s1, s2, s1_sz);
+}
+
+const struct bpf_func_proto bpf_strncmp_proto = {
+ .func = bpf_strncmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+};
+
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
{
@@ -630,7 +645,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -667,7 +682,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
.func = bpf_per_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
.arg2_type = ARG_ANYTHING,
};
@@ -680,7 +695,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.func = bpf_this_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
@@ -1011,7 +1026,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1376,6 +1391,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_query_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_strncmp:
+ return &bpf_strncmp_proto;
default:
break;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 035e9e3a7132..23f7f9d08a62 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return 0;
}
- new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
- map->value_size,
+ new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!new)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 423549d2c52e..5763cc7ac4f1 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -412,7 +412,7 @@ static int trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(im_node->child[1], node);
}
- /* Finally, assign the intermediate node to the determined spot */
+ /* Finally, assign the intermediate node to the determined slot */
rcu_assign_pointer(*slot, im_node);
out:
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6a9542af4212..b0fa190b0979 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
- PTR_TO_RDONLY_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
{ offsetof(struct bpf_iter__bpf_map_elem, value),
- PTR_TO_RDWR_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL },
},
};
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+ struct irq_work irq_work;
+ struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = false;
+
+ if (irqs_disabled()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ work = this_cpu_ptr(&mmap_unlock_work);
+ if (irq_work_is_busy(&work->irq_work)) {
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+ } else {
+ /*
+ * PREEMPT_RT does not allow to trylock mmap sem in
+ * interrupt disabled context. Force the fallback code.
+ */
+ irq_work_busy = true;
+ }
+ }
+
+ *work_ptr = work;
+ return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+ if (!work) {
+ mmap_read_unlock(mm);
+ } else {
+ work->mm = mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 542f275bf252..868cc2c43899 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
+#include <linux/bpf-netns.h>
#include <linux/filter.h>
#include <net/net_namespace.h>
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 93a55391791a..556a769b5b80 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- u64 array_size;
if (!bpf_capable())
return ERR_PTR(-EPERM);
- array_size = sizeof(*array);
- array_size += (u64)attr->max_entries * sizeof(struct sock *);
-
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size, numa_node);
+ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 9e0c10c6892a..638d7fd7b375 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
.func = bpf_ringbuf_output,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6e75bbee39f0..49e567209c6b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,10 @@
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
-#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
#define STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
@@ -31,25 +31,6 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
- struct irq_work irq_work;
- struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
- struct stack_map_irq_work *work;
-
- if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
- return;
-
- work = container_of(entry, struct stack_map_irq_work, irq_work);
- mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
struct vm_area_struct *vma;
- bool irq_work_busy = false;
- struct stack_map_irq_work *work = NULL;
-
- if (irqs_disabled()) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- work = this_cpu_ptr(&up_read_work);
- if (irq_work_is_busy(&work->irq_work)) {
- /* cannot queue more up_read, fallback */
- irq_work_busy = true;
- }
- } else {
- /*
- * PREEMPT_RT does not allow to trylock mmap sem in
- * interrupt disabled context. Force the fallback code.
- */
- irq_work_busy = true;
- }
- }
- /*
- * We cannot do up_read() when the irq is disabled, because of
- * risk to deadlock with rq_lock. To do build_id lookup when the
- * irqs are disabled, we need to run up_read() in irq_work. We use
- * a percpu variable to do the irq_work. If the irq_work is
- * already used by another lookup, we fall back to report ips.
- *
- * Same fallback is used for kernel stack (!user) on a stackmap
- * with build_id.
+ /* If the irq_work is in use, fall back to report ips. Same
+ * fallback is used for kernel stack (!user) on a stackmap with
+ * build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
!mmap_read_trylock(current->mm)) {
@@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
-
- if (!work) {
- mmap_read_unlock(current->mm);
- } else {
- work->mm = current->mm;
-
- /* The lock will be released once we're out of interrupt
- * context. Tell lockdep that we've released it now so
- * it doesn't complain that we forgot to release it.
- */
- rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
- irq_work_queue(&work->irq_work);
- }
+ bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
@@ -542,7 +489,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &btf_task_struct_ids[0],
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
@@ -719,16 +666,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_btf_name = "bpf_stack_map",
.map_btf_id = &stack_trace_map_btf_id,
};
-
-static int __init stack_map_init(void)
-{
- int cpu;
- struct stack_map_irq_work *work;
-
- for_each_possible_cpu(cpu) {
- work = per_cpu_ptr(&up_read_work, cpu);
- init_irq_work(&work->irq_work, do_up_read);
- }
- return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1033ee8c0caf..fa4505f9b611 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
@@ -2198,7 +2199,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD fd_array
+#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
@@ -4772,7 +4773,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -4819,7 +4820,7 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b48750bfba5a..d94696198ef8 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
+#include "mmap_unlock_work.h"
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
@@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
.show = task_vma_seq_show,
};
-BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, file)
-BTF_ID(struct, vm_area_struct)
-
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
@@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = {
.seq_info = &task_vma_seq_info,
};
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+ bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct mm_struct *mm;
+ int ret = -ENOENT;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!task)
+ return -ENOENT;
+
+ mm = task->mm;
+ if (!mm)
+ return -ENOENT;
+
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+ if (irq_work_busy || !mmap_read_trylock(mm))
+ return -EBUSY;
+
+ vma = find_vma(mm, start);
+
+ if (vma && vma->vm_start <= start && vma->vm_end > start) {
+ callback_fn((u64)(long)task, (u64)(long)vma,
+ (u64)(long)callback_ctx, 0, 0);
+ ret = 0;
+ }
+ bpf_mmap_unlock_mm(work, mm);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+ .func = bpf_find_vma,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FUNC,
+ .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg5_type = ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+ struct mmap_unlock_irq_work *work;
+
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return;
+
+ work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ mmap_read_unlock_non_owner(work->mm);
+}
+
static int __init task_iter_init(void)
{
- int ret;
+ struct mmap_unlock_irq_work *work;
+ int ret, cpu;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&mmap_unlock_work, cpu);
+ init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ }
- task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+ task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
- task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
- task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
- task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
- task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e98de5e73ba5..4b6974a195c1 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -27,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+
+ return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN;
+}
+
void *bpf_jit_alloc_exec_page(void)
{
void *image;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f3001937bbb9..bfb45381fb3f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4,6 +4,7 @@
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
*/
#include <uapi/linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -293,13 +294,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
"verifier log line truncated - local buffer too short\n");
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
-
if (log->level == BPF_LOG_KERNEL) {
- pr_err("BPF:%s\n", log->kbuf);
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
return;
}
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
log->len_used += n;
else
@@ -439,18 +442,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON;
}
-static bool reg_type_may_be_null(enum bpf_reg_type type)
-{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_BTF_ID_OR_NULL ||
- type == PTR_TO_MEM_OR_NULL ||
- type == PTR_TO_RDONLY_BUF_OR_NULL ||
- type == PTR_TO_RDWR_BUF_OR_NULL;
-}
-
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return reg->type == PTR_TO_MAP_VALUE &&
@@ -459,12 +450,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_MEM ||
- type == PTR_TO_MEM_OR_NULL;
+ return base_type(type) == PTR_TO_SOCKET ||
+ base_type(type) == PTR_TO_TCP_SOCK ||
+ base_type(type) == PTR_TO_MEM;
+}
+
+static bool type_is_rdonly_mem(u32 type)
+{
+ return type & MEM_RDONLY;
}
static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
@@ -472,14 +465,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
return type == ARG_PTR_TO_SOCK_COMMON;
}
-static bool arg_type_may_be_null(enum bpf_arg_type type)
+static bool type_may_be_null(u32 type)
{
- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_CTX_OR_NULL ||
- type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
- type == ARG_PTR_TO_STACK_OR_NULL;
+ return type & PTR_MAYBE_NULL;
}
/* Determine whether the function releases some resources allocated by another
@@ -539,39 +527,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
-/* string representation of 'enum bpf_reg_type' */
-static const char * const reg_type_str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
- [PTR_TO_RDONLY_BUF] = "rdonly_buf",
- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
- [PTR_TO_RDWR_BUF] = "rdwr_buf",
- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
- [PTR_TO_FUNC] = "func",
- [PTR_TO_MAP_KEY] = "map_key",
-};
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+static const char *reg_type_str(struct bpf_verifier_env *env,
+ enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[16] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID ||
+ base_type(type) == PTR_TO_PERCPU_BTF_ID)
+ strncpy(postfix, "or_null_", 16);
+ else
+ strncpy(postfix, "_or_null", 16);
+ }
+
+ if (type & MEM_RDONLY)
+ strncpy(prefix, "rdonly_", 16);
+
+ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->type_str_buf;
+}
static char slot_type_char[] = {
[STACK_INVALID] = '?',
@@ -606,6 +609,44 @@ static const char *kernel_type_name(const struct btf* btf, u32 id)
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+ env->scratched_regs |= 1U << regno;
+}
+
+static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+ env->scratched_stack_slots |= 1UL << spi;
+}
+
+static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+ return (env->scratched_regs >> regno) & 1;
+}
+
+static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+ return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+ return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = 0U;
+ env->scratched_stack_slots = 0UL;
+}
+
+/* Used for printing the entire verifier state. */
+static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = ~0U;
+ env->scratched_stack_slots = ~0UL;
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
@@ -621,7 +662,8 @@ static void scrub_spilled_slot(u8 *stype)
}
static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+ const struct bpf_func_state *state,
+ bool print_all)
{
const struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -634,9 +676,11 @@ static void print_verifier_state(struct bpf_verifier_env *env,
t = reg->type;
if (t == NOT_INIT)
continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=%s", reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
@@ -644,9 +688,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID ||
- t == PTR_TO_BTF_ID_OR_NULL ||
- t == PTR_TO_PERCPU_BTF_ID)
+ if (base_type(t) == PTR_TO_BTF_ID ||
+ base_type(t) == PTR_TO_PERCPU_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -655,10 +698,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, ",off=%d", reg->off);
if (type_is_pkt_pointer(t))
verbose(env, ",r=%d", reg->range);
- else if (t == CONST_PTR_TO_MAP ||
- t == PTR_TO_MAP_KEY ||
- t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
+ else if (base_type(t) == CONST_PTR_TO_MAP ||
+ base_type(t) == PTR_TO_MAP_KEY ||
+ base_type(t) == PTR_TO_MAP_VALUE)
verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
@@ -723,12 +765,14 @@ static void print_verifier_state(struct bpf_verifier_env *env,
types_buf[BPF_REG_SIZE] = 0;
if (!valid)
continue;
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, state->stack[i].spilled_ptr.live);
if (is_spilled_reg(&state->stack[i])) {
reg = &state->stack[i].spilled_ptr;
t = reg->type;
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=%s", reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
@@ -748,6 +792,26 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->in_async_callback_fn)
verbose(env, " async_cb");
verbose(env, "\n");
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+static void print_insn_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state)
+{
+ if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_len - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, state, false);
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -1141,8 +1205,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
- switch (reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL: {
+ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
const struct bpf_map *map = reg->map_ptr;
if (map->inner_map_meta) {
@@ -1161,32 +1224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
} else {
reg->type = PTR_TO_MAP_VALUE;
}
- break;
- }
- case PTR_TO_SOCKET_OR_NULL:
- reg->type = PTR_TO_SOCKET;
- break;
- case PTR_TO_SOCK_COMMON_OR_NULL:
- reg->type = PTR_TO_SOCK_COMMON;
- break;
- case PTR_TO_TCP_SOCK_OR_NULL:
- reg->type = PTR_TO_TCP_SOCK;
- break;
- case PTR_TO_BTF_ID_OR_NULL:
- reg->type = PTR_TO_BTF_ID;
- break;
- case PTR_TO_MEM_OR_NULL:
- reg->type = PTR_TO_MEM;
- break;
- case PTR_TO_RDONLY_BUF_OR_NULL:
- reg->type = PTR_TO_RDONLY_BUF;
- break;
- case PTR_TO_RDWR_BUF_OR_NULL:
- reg->type = PTR_TO_RDWR_BUF;
- break;
- default:
- WARN_ONCE(1, "unknown nullable register type");
+ return;
}
+
+ reg->type &= ~PTR_MAYBE_NULL;
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
@@ -1366,22 +1407,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static bool __reg32_bound_s64(s32 a)
+{
+ return a >= 0 && a <= S32_MAX;
+}
+
static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
reg->umin_value = reg->u32_min_value;
reg->umax_value = reg->u32_max_value;
- /* Attempt to pull 32-bit signed bounds into 64-bit bounds
- * but must be positive otherwise set to worse case bounds
- * and refine later from tnum.
+
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+ * be positive otherwise set to worse case bounds and refine later
+ * from tnum.
*/
- if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0)
- reg->smax_value = reg->s32_max_value;
- else
- reg->smax_value = U32_MAX;
- if (reg->s32_min_value >= 0)
+ if (__reg32_bound_s64(reg->s32_min_value) &&
+ __reg32_bound_s64(reg->s32_max_value)) {
reg->smin_value = reg->s32_min_value;
- else
+ reg->smax_value = reg->s32_max_value;
+ } else {
reg->smin_value = 0;
+ reg->smax_value = U32_MAX;
+ }
}
static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
@@ -1538,6 +1585,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->frameno = frameno;
state->subprogno = subprogno;
init_reg_state(env, state);
+ mark_verifier_state_scratched(env);
}
/* Similar to push_stack(), but for async callbacks */
@@ -2041,7 +2089,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
break;
if (parent->live & REG_LIVE_DONE) {
verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str[parent->type],
+ reg_type_str(env, parent->type),
parent->var_off.value, parent->off);
return -EFAULT;
}
@@ -2225,6 +2273,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return -EINVAL;
}
+ mark_reg_scratched(env, regno);
+
reg = &regs[regno];
rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
@@ -2329,7 +2379,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (insn->code == 0)
return 0;
- if (env->log.level & BPF_LOG_LEVEL) {
+ if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
@@ -2379,8 +2429,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
if (insn->src_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
@@ -2403,8 +2451,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
/* scalars can only be spilled into stack */
if (insn->dst_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
spi = (-insn->off - 1) / BPF_REG_SIZE;
if (spi >= 64) {
verbose(env, "BUG spi %d\n", spi);
@@ -2587,7 +2633,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
- if (env->log.level & BPF_LOG_LEVEL)
+ if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
for (i = last_idx;;) {
if (skip_first) {
@@ -2674,11 +2720,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
new_marks = true;
reg->precise = true;
}
- if (env->log.level & BPF_LOG_LEVEL) {
- print_verifier_state(env, func);
- verbose(env, "parent %s regs=%x stack=%llx marks\n",
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "parent %s regs=%x stack=%llx marks:",
new_marks ? "didn't have" : "already had",
reg_mask, stack_mask);
+ print_verifier_state(env, func, true);
}
if (!reg_mask && !stack_mask)
@@ -2704,9 +2750,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
static bool is_spillable_regtype(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_MAP_VALUE:
- case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -2715,21 +2760,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
- case PTR_TO_RDONLY_BUF:
- case PTR_TO_RDONLY_BUF_OR_NULL:
- case PTR_TO_RDWR_BUF:
- case PTR_TO_RDWR_BUF_OR_NULL:
+ case PTR_TO_BUF:
case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
- case PTR_TO_MEM_OR_NULL:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
return true;
@@ -2834,6 +2871,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
}
+ mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
@@ -2955,6 +2993,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ mark_stack_slot_scratched(env, spi);
if (!env->allow_ptr_leaks
&& *stype != NOT_INIT
@@ -3371,11 +3410,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
/* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
- */
- if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, state);
-
- /* The minimum value is only important with signed
+ *
+ * The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
@@ -3570,7 +3606,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
*btf = info.btf;
*btf_id = info.btf_id;
} else {
@@ -3638,7 +3674,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
}
verbose(env, "R%d invalid %s access off=%d size=%d\n",
- regno, reg_type_str[reg->type], off, size);
+ regno, reg_type_str(env, reg->type), off, size);
return -EACCES;
}
@@ -4365,15 +4401,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
- } else if (reg->type == PTR_TO_MEM) {
+ } else if (base_type(reg->type) == PTR_TO_MEM) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+
+ if (type_may_be_null(reg->type)) {
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && rdonly_mem) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into mem\n", value_regno);
return -EACCES;
}
+
err = check_mem_region_access(env, regno, off, size,
reg->mem_size, false);
- if (!err && t == BPF_READ && value_regno >= 0)
+ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -4403,7 +4454,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (reg_type_may_be_null(reg_type))
+ if (type_may_be_null(reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -4411,8 +4462,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID ||
- reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(reg_type) == PTR_TO_BTF_ID) {
regs[value_regno].btf = btf;
regs[value_regno].btf_id = btf_id;
}
@@ -4465,7 +4515,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
+ regno, reg_type_str(env, reg->type));
return -EACCES;
}
err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -4481,26 +4531,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == CONST_PTR_TO_MAP) {
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
value_regno);
- } else if (reg->type == PTR_TO_RDONLY_BUF) {
- if (t == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
- return -EACCES;
+ } else if (base_type(reg->type) == PTR_TO_BUF) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ const char *buf_info;
+ u32 *max_access;
+
+ if (rdonly_mem) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ buf_info = "rdonly";
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ buf_info = "rdwr";
+ max_access = &env->prog->aux->max_rdwr_access;
}
+
err = check_buffer_access(env, reg, regno, off, size, false,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- if (!err && value_regno >= 0)
- mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_RDWR_BUF) {
- err = check_buffer_access(env, reg, regno, off, size, false,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
- if (!err && t == BPF_READ && value_regno >= 0)
+ buf_info, max_access);
+
+ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EACCES;
}
@@ -4551,9 +4607,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (insn->imm == BPF_CMPXCHG) {
/* Check comparison of R0 with memory location */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ const u32 aux_reg = BPF_REG_0;
+
+ err = check_reg_arg(env, aux_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, aux_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", aux_reg);
+ return -EACCES;
+ }
}
if (is_pointer_value(env, insn->src_reg)) {
@@ -4567,7 +4630,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -4588,13 +4651,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
load_reg = -1;
}
- /* check whether we can read the memory */
+ /* Check whether we can read the memory, with second call for fetch
+ * case to simulate the register fill.
+ */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true);
+ if (!err && load_reg >= 0)
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, load_reg,
+ true);
if (err)
return err;
- /* check whether we can write into the same memory */
+ /* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true);
if (err)
@@ -4744,8 +4813,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const char *buf_info;
+ u32 *max_access;
- switch (reg->type) {
+ switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
@@ -4764,18 +4835,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
- case PTR_TO_RDONLY_BUF:
- if (meta && meta->raw_mode)
- return -EACCES;
- return check_buffer_access(env, reg, regno, reg->off,
- access_size, zero_size_allowed,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- case PTR_TO_RDWR_BUF:
+ case PTR_TO_BUF:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode)
+ return -EACCES;
+
+ buf_info = "rdonly";
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ buf_info = "rdwr";
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
+ buf_info, max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
@@ -4787,9 +4860,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
register_is_null(reg))
return 0;
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type],
- reg_type_str[PTR_TO_STACK]);
+ verbose(env, "R%d type=%s ", regno,
+ reg_type_str(env, reg->type));
+ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
return -EACCES;
}
}
@@ -4800,7 +4873,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
if (register_is_null(reg))
return 0;
- if (reg_type_may_be_null(reg->type)) {
+ if (type_may_be_null(reg->type)) {
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -4948,9 +5021,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_MEM ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_UNINIT_MEM;
+ return base_type(type) == ARG_PTR_TO_MEM ||
+ base_type(type) == ARG_PTR_TO_UNINIT_MEM;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -5055,8 +5127,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_RDONLY_BUF,
- PTR_TO_RDWR_BUF,
+ PTR_TO_BUF,
},
};
@@ -5087,31 +5158,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
[ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
[ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_MAP_PTR] = &const_map_ptr_types,
[ARG_PTR_TO_CTX] = &context_types,
- [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
[ARG_PTR_TO_SOCK_COMMON] = &sock_types,
#ifdef CONFIG_NET
[ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
#endif
[ARG_PTR_TO_SOCKET] = &fullsock_types,
- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
[ARG_PTR_TO_UNINIT_MEM] = &mem_types,
[ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
+ [ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
};
@@ -5125,12 +5191,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
const struct bpf_reg_types *compatible;
int i, j;
- compatible = compatible_reg_types[arg_type];
+ compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
return -EFAULT;
}
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
+ *
+ * Same for MAYBE_NULL:
+ *
+ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
+ *
+ * Therefore we fold these flags depending on the arg_type before comparison.
+ */
+ if (arg_type & MEM_RDONLY)
+ type &= ~MEM_RDONLY;
+ if (arg_type & PTR_MAYBE_NULL)
+ type &= ~PTR_MAYBE_NULL;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -5140,14 +5221,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
goto found;
}
- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
for (j = 0; j + 1 < i; j++)
- verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
- verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
+ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
+ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
return -EACCES;
found:
- if (type == PTR_TO_BTF_ID) {
+ if (reg->type == PTR_TO_BTF_ID) {
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -5206,15 +5287,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
+ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type);
if (err)
return err;
}
- if (register_is_null(reg) && arg_type_may_be_null(arg_type))
+ if (register_is_null(reg) && type_may_be_null(arg_type))
/* A NULL register has a SCALAR_VALUE type, so skip
* type checking.
*/
@@ -5283,10 +5363,11 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
- !register_is_null(reg)) ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
+ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ if (type_may_be_null(arg_type) && register_is_null(reg))
+ return 0;
+
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
@@ -5950,6 +6031,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
insn->imm == BPF_FUNC_timer_set_callback) {
struct bpf_verifier_state *async_cb;
@@ -6008,9 +6090,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
}
return 0;
}
@@ -6101,6 +6183,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+ * u64 flags);
+ * callback_fn(u32 index, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int set_timer_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -6130,6 +6233,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_find_vma(struct task_struct *task, u64 addr,
+ * void *callback_fn, void *callback_ctx, u64 flags)
+ * (callback_fn)(struct task_struct *task,
+ * struct vm_area_struct *vma, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].btf = btf_vmlinux;
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -6177,9 +6307,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
}
/* clear everything in the callee */
free_func_state(callee);
@@ -6345,13 +6475,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
static int check_get_func_ip(struct bpf_verifier_env *env)
{
- enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
int func_id = BPF_FUNC_get_func_ip;
if (type == BPF_PROG_TYPE_TRACING) {
- if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
- eatype != BPF_MODIFY_RETURN) {
+ if (!bpf_prog_has_trampoline(env->prog)) {
verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
func_id_name(func_id), func_id);
return -ENOTSUPP;
@@ -6370,6 +6498,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
+ enum bpf_return_type ret_type;
+ enum bpf_type_flag ret_flag;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
int insn_idx = *insn_idx_p;
@@ -6447,13 +6577,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (func_id == BPF_FUNC_tail_call) {
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
- return err;
- }
- } else if (is_release_function(func_id)) {
+ if (is_release_function(func_id)) {
err = release_reference(env, meta.ref_obj_id);
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
@@ -6464,35 +6588,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
- /* check that flags argument in get_local_storage(map, flags) is 0,
- * this is required because get_local_storage() can't return an error.
- */
- if (func_id == BPF_FUNC_get_local_storage &&
- !register_is_null(&regs[BPF_REG_2])) {
- verbose(env, "get_local_storage() doesn't support non-zero flags\n");
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_for_each_map_elem) {
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ err = check_reference_leak(env);
+ if (err) {
+ verbose(env, "tail_call would lead to reference leak\n");
+ return err;
+ }
+ break;
+ case BPF_FUNC_get_local_storage:
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (!register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_for_each_map_elem:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_map_elem_callback_state);
- if (err < 0)
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_timer_set_callback) {
+ break;
+ case BPF_FUNC_timer_set_callback:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_timer_callback_state);
- if (err < 0)
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_snprintf) {
+ break;
+ case BPF_FUNC_find_vma:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_find_vma_callback_state);
+ break;
+ case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
- if (err < 0)
- return err;
+ break;
+ case BPF_FUNC_loop:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_loop_callback_state);
+ break;
}
+ if (err)
+ return err;
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -6503,13 +6639,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* update return register (already marked as written above) */
- if (fn->ret_type == RET_INTEGER) {
+ ret_type = fn->ret_type;
+ ret_flag = type_flag(fn->ret_type);
+ if (ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (fn->ret_type == RET_VOID) {
+ } else if (ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -6523,28 +6660,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
- if (map_value_has_spin_lock(meta.map_ptr))
- regs[BPF_REG_0].id = ++env->id_gen;
- } else {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
+ if (!type_may_be_null(ret_type) &&
+ map_value_has_spin_lock(meta.map_ptr)) {
+ regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
+ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -6562,29 +6696,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
tname, PTR_ERR(ret));
return -EINVAL;
}
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_BTF_ID) {
+ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
- PTR_TO_BTF_ID :
- PTR_TO_BTF_ID_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
ret_btf_id = *fn->ret_btf_id;
if (ret_btf_id == 0) {
- verbose(env, "invalid return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "invalid return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* current BPF helper definitions are only coming from
@@ -6593,12 +6728,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf = btf_vmlinux;
regs[BPF_REG_0].btf_id = ret_btf_id;
} else {
- verbose(env, "unknown return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "unknown return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
}
- if (reg_type_may_be_null(regs[BPF_REG_0].type))
+ if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
if (is_ptr_cast_function(func_id)) {
@@ -6807,25 +6942,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
- reg_type_str[type], val);
+ reg_type_str(env, type), val);
return false;
}
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str[type], reg->off);
+ reg_type_str(env, type), reg->off);
return false;
}
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
- reg_type_str[type]);
+ reg_type_str(env, type));
return false;
}
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
verbose(env, "value %lld makes %s pointer be out of bounds\n",
- smin, reg_type_str[type]);
+ smin, reg_type_str(env, type));
return false;
}
@@ -7202,11 +7337,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
- switch (ptr_reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL:
+ if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
+ }
+
+ switch (base_type(ptr_reg->type)) {
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
@@ -7214,14 +7351,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
default:
break;
@@ -8194,12 +8328,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -8308,6 +8442,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->dst_reg);
}
zext_32_to_64(dst_reg);
+
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
}
} else {
/* case: R = imm
@@ -8940,17 +9078,17 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
struct bpf_reg_state *reg, u32 id,
bool is_null)
{
- if (reg_type_may_be_null(reg->type) && reg->id == id &&
+ if (type_may_be_null(reg->type) && reg->id == id &&
!WARN_ON_ONCE(!reg->id)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL.
- */
if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
!tnum_equals_const(reg->var_off, 0) ||
reg->off)) {
- __mark_reg_known_zero(reg);
- reg->off = 0;
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL. If we
+ * see this happening, don't convert the register.
+ */
+ return;
}
if (is_null) {
reg->type = SCALAR_VALUE;
@@ -9318,7 +9456,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- reg_type_may_be_null(dst_reg->type)) {
+ type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -9334,7 +9472,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -9373,7 +9511,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->type = aux->btf_var.reg_type;
- switch (dst_reg->type) {
+ switch (base_type(dst_reg->type)) {
case PTR_TO_MEM:
dst_reg->mem_size = aux->btf_var.mem_size;
break;
@@ -9572,7 +9710,7 @@ static int check_return_code(struct bpf_verifier_env *env)
/* enforce return zero from async callbacks like timer */
if (reg->type != SCALAR_VALUE) {
verbose(env, "In async callback the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -9586,7 +9724,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -9650,7 +9788,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (reg->type != SCALAR_VALUE) {
verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -10233,6 +10371,78 @@ err_free:
return err;
}
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
bpfptr_t uattr)
@@ -10263,6 +10473,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
if (err)
return err;
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
return 0;
}
@@ -10431,7 +10645,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return true;
if (rcur->type == NOT_INIT)
return false;
- switch (rold->type) {
+ switch (base_type(rold->type)) {
case SCALAR_VALUE:
if (env->explore_alu_limits)
return false;
@@ -10453,6 +10667,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (type_may_be_null(rold->type)) {
+ if (!type_may_be_null(rcur->type))
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ }
+
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
* 'id' is not compared, since it's only used for maps with
@@ -10464,20 +10694,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_VALUE_OR_NULL:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
if (rcur->type != rold->type)
@@ -10506,11 +10722,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
case PTR_TO_PACKET_END:
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
/* Only valid matches are exact, which memcmp() above
* would have accepted
@@ -11036,17 +11249,13 @@ next:
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_CTX:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -11126,16 +11335,12 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level & BPF_LOG_LEVEL2 ||
- (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%d:", env->insn_idx);
- else
- verbose(env, "\nfrom %d to %d%s:",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe]);
+ if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env, state->frame[state->curframe], true);
do_print_state = false;
}
@@ -11146,9 +11351,15 @@ static int do_check(struct bpf_verifier_env *env)
.private_data = env,
};
+ if (verifier_state_scratched(env))
+ print_insn_state(env, state->frame[state->curframe]);
+
verbose_linfo(env, env->insn_idx, "; ");
+ env->prev_log_len = env->log.len_used;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
+ env->prev_log_len = env->log.len_used;
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
@@ -11270,7 +11481,7 @@ static int do_check(struct bpf_verifier_env *env)
if (is_ctx_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -11357,6 +11568,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
process_bpf_exit:
+ mark_verifier_state_scratched(env);
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
&env->insn_idx, pop_log);
@@ -11522,7 +11734,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -EINVAL;
goto err_put;
}
- aux->btf_var.reg_type = PTR_TO_MEM;
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
@@ -11682,6 +11894,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
break;
default:
verbose(env,
@@ -12865,6 +13080,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
+ enum bpf_attach_type eatype = prog->expected_attach_type;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
@@ -13235,11 +13451,79 @@ patch_map_ops_generic:
continue;
}
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[7] = BPF_JMP_A(1);
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ cnt = 9;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 6;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
/* Implement bpf_get_func_ip inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ip) {
- /* Load IP address from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
if (!new_prog)
@@ -13353,7 +13637,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
else if (regs[i].type == SCALAR_VALUE)
mark_reg_unknown(env, regs, i);
- else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+ else if (base_type(regs[i].type) == PTR_TO_MEM) {
const u32 mem_size = regs[i].mem_size;
mark_reg_known_zero(env, regs, i);
@@ -13941,13 +14225,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
log->ubuf = (char __user *) (unsigned long) attr->log_buf;
log->len_total = attr->log_size;
- ret = -EINVAL;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
- !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
+ if (!bpf_verifier_log_attr_valid(log)) {
+ ret = -EINVAL;
goto err_unlock;
+ }
}
+ mark_verifier_state_clean(env);
+
if (IS_ERR(btf_vmlinux)) {
/* Either gcc or pahole or kernel are broken. */
verbose(env, "in-kernel BTF is malformed\n");
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..6e36e854b512 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 81c9e0685948..41e0837a5a0b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -394,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -403,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -449,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -504,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 919194de39c8..b31e1465868a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -2650,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
if (!list_empty(&src_cset->mg_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -3630,6 +3631,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
@@ -3648,7 +3650,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
+ psi_trigger_replace(&ctx->psi.trigger, new);
cgroup_put(cgrp);
@@ -3679,12 +3681,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_replace(&ctx->psi.trigger, NULL);
}
bool cgroup_psi_enabled(void)
@@ -3811,24 +3817,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
@@ -3845,7 +3870,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -4751,21 +4776,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4773,21 +4798,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4838,9 +4860,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
int ret;
@@ -4869,11 +4891,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb, bool threadgroup)
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
int ret = 0;
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
if (ret)
return ret;
@@ -4890,8 +4913,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
bool threadgroup)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
bool locked;
@@ -4909,9 +4934,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- /* process and thread migrations follow same delegation rule */
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, threadgroup);
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
@@ -5711,7 +5743,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -6130,7 +6162,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
goto err;
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
- !(kargs->flags & CLONE_THREAD));
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
if (ret)
goto err;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d0e163a02099..dc653ab26e50 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -616,19 +616,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
struct cpuset *c, *par;
int ret;
- rcu_read_lock();
-
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
-
- /* Remaining checks don't apply to root cpuset */
- ret = 0;
+ /* The checks don't apply to root cpuset */
if (cur == &top_cpuset)
- goto out;
+ return 0;
+ rcu_read_lock();
par = parent_cs(cur);
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
@@ -3536,7 +3528,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 1486768f2318..9d331ba44870 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
@@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
if (pos == root)
return NULL;
@@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
- if (!pos)
+ if (!pos) {
pos = root;
- else
+ /* return NULL if this subtree is not on-list */
+ if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
+ return NULL;
+ } else {
pos = cgroup_parent(pos);
+ }
/* walk down to the first leaf */
while (true) {
@@ -115,33 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
-
- if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
-
- prstatc = cgroup_rstat_cpu(parent, cpu);
- nextp = &prstatc->updated_children;
- while (true) {
- struct cgroup_rstat_cpu *nrstatc;
-
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
- WARN_ON_ONCE(*nextp == parent);
- nextp = &nrstatc->updated_next;
- }
- *nextp = rstatc->updated_next;
- }
+ parent = cgroup_parent(pos);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup **nextp;
- rstatc->updated_next = NULL;
- return pos;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (*nextp != pos) {
+ struct cgroup_rstat_cpu *nrstatc;
+
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
}
- /* only happens for @root */
- return NULL;
+ rstatc->updated_next = NULL;
+ return pos;
}
/* see cgroup_rstat_flush() */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index eb53f5ec62c9..256cf6db573c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -6,6 +6,7 @@
#include <linux/buildid.h>
#include <linux/crash_core.h>
+#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
@@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d5a61d565ad5..bad713684c2e 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -187,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
@@ -196,7 +196,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work = read_thread_flags();
lockdep_assert_irqs_disabled();
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 49972ee99aff..96d476e06c77 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -26,7 +26,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
if (ret)
return ret;
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
return 0;
}
@@ -43,7 +43,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
* disabled in the inner loop before going into guest mode. No need
* to disable interrupts here.
*/
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 30d94f68c5bd..1362b9b770d8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1808,6 +1808,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1999,6 +2001,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f895265d7548..c09324663088 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- unsigned int flags = 0;
-
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- __handle_irq_event_percpu(desc, &flags);
+ __handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27182003b879..9489f93b3db3 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
@@ -174,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
}
__irq_wake_thread(desc, action);
-
- fallthrough; /* to add to randomness */
- case IRQ_HANDLED:
- *flags |= action->flags;
break;
default:
@@ -193,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
- unsigned int flags = 0;
- retval = __handle_irq_event_percpu(desc, &flags);
+ retval = __handle_irq_event_percpu(desc);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 54363527feea..99cbdf55a8bd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index c2bb07f5bcc7..4f35d1bced6a 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ $(call cc-option,-mno-outline-atomics) \
-fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
+
+KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 4b84c8e7884b..fe12dfe254ec 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+static bool kcsan_weak_memory = true;
+module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
+#else
+#define kcsan_weak_memory false
+#endif
+
bool kcsan_enabled;
/* Per-CPU kcsan_ctx for interrupts */
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
- .disable_count = 0,
- .atomic_next = 0,
- .atomic_nest_count = 0,
- .in_flat_atomic = false,
- .access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
};
@@ -209,15 +211,17 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
static noinline void kcsan_check_scoped_accesses(void)
{
struct kcsan_ctx *ctx = get_ctx();
- struct list_head *prev_save = ctx->scoped_accesses.prev;
struct kcsan_scoped_access *scoped_access;
- ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
+ if (ctx->disable_scoped)
+ return;
+
+ ctx->disable_scoped++;
list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
check_access(scoped_access->ptr, scoped_access->size,
scoped_access->type, scoped_access->ip);
}
- ctx->scoped_accesses.prev = prev_save;
+ ctx->disable_scoped--;
}
/* Rules for generic atomic accesses. Called from fast-path. */
@@ -325,6 +329,21 @@ static void delay_access(int type)
udelay(delay);
}
+/*
+ * Reads the instrumented memory for value change detection; value change
+ * detection is currently done for accesses up to a size of 8 bytes.
+ */
+static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
+{
+ switch (size) {
+ case 1: return READ_ONCE(*(const u8 *)ptr);
+ case 2: return READ_ONCE(*(const u16 *)ptr);
+ case 4: return READ_ONCE(*(const u32 *)ptr);
+ case 8: return READ_ONCE(*(const u64 *)ptr);
+ default: return 0; /* Ignore; we do not diff the values. */
+ }
+}
+
void kcsan_save_irqtrace(struct task_struct *task)
{
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -339,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task)
#endif
}
+static __always_inline int get_kcsan_stack_depth(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return current->kcsan_stack_depth;
+#else
+ BUILD_BUG();
+ return 0;
+#endif
+}
+
+static __always_inline void add_kcsan_stack_depth(int val)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ current->kcsan_stack_depth += val;
+#else
+ BUILD_BUG();
+#endif
+}
+
+static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return ctx->disable_scoped ? NULL : &ctx->reorder_access;
+#else
+ return NULL;
+#endif
+}
+
+static __always_inline bool
+find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access)
+ return false;
+
+ /*
+ * Note: If accesses are repeated while reorder_access is identical,
+ * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
+ */
+ return reorder_access->ptr == ptr && reorder_access->size == size &&
+ reorder_access->type == type && reorder_access->ip == ip;
+}
+
+static inline void
+set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access || !kcsan_weak_memory)
+ return;
+
+ /*
+ * To avoid nested interrupts or scheduler (which share kcsan_ctx)
+ * reading an inconsistent reorder_access, ensure that the below has
+ * exclusive access to reorder_access by disallowing concurrent use.
+ */
+ ctx->disable_scoped++;
+ barrier();
+ reorder_access->ptr = ptr;
+ reorder_access->size = size;
+ reorder_access->type = type | KCSAN_ACCESS_SCOPED;
+ reorder_access->ip = ip;
+ reorder_access->stack_depth = get_kcsan_stack_depth();
+ barrier();
+ ctx->disable_scoped--;
+}
+
/*
* Pull everything together: check_access() below contains the performance
* critical operations; the fast-path (including check_access) functions should
@@ -377,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* The access_mask check relies on value-change comparison. To avoid
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
+ *
+ * reorder_access is never created from an access with access_mask set.
*/
- if (ctx->access_mask)
+ if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
return;
/*
@@ -428,11 +519,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
u64 old, new, diff;
- unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ bool interrupt_watcher = kcsan_interrupt_watcher;
unsigned long ua_flags = user_access_save();
struct kcsan_ctx *ctx = get_ctx();
+ unsigned long access_mask = ctx->access_mask;
unsigned long irq_flags = 0;
+ bool is_reorder_access;
/*
* Always reset kcsan_skip counter in slow-path to avoid underflow; see
@@ -456,12 +549,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
}
/*
+ * The local CPU cannot observe reordering of its own accesses, and
+ * therefore we need to take care of 2 cases to avoid false positives:
+ *
+ * 1. Races of the reordered access with interrupts. To avoid, if
+ * the current access is reorder_access, disable interrupts.
+ * 2. Avoid races of scoped accesses from nested interrupts (below).
+ */
+ is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
+ if (is_reorder_access)
+ interrupt_watcher = false;
+ /*
+ * Avoid races of scoped accesses from nested interrupts (or scheduler).
+ * Assume setting up a watchpoint for a non-scoped (normal) access that
+ * also conflicts with a current scoped access. In a nested interrupt,
+ * which shares the context, it would check a conflicting scoped access.
+ * To avoid, disable scoped access checking.
+ */
+ ctx->disable_scoped++;
+
+ /*
* Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
* runtime is entered for every memory access, and potentially useful
* information is lost if dirtied by KCSAN.
*/
kcsan_save_irqtrace(current);
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
@@ -482,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- old = 0;
- switch (size) {
- case 1:
- old = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- old = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- old = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- old = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
- }
+ old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
/*
* Delay this thread, to increase probability of observing a racy
@@ -510,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
- access_mask = ctx->access_mask;
- new = 0;
- switch (size) {
- case 1:
- new = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- new = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- new = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- new = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
+ if (!is_reorder_access) {
+ new = read_instrumented_memory(ptr, size);
+ } else {
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * because the memory location may no longer be accessible and
+ * could result in a fault.
+ */
+ new = 0;
+ access_mask = 0;
}
diff = old ^ new;
@@ -596,10 +686,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
*/
remove_watchpoint(watchpoint);
atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
out_unlock:
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_restore(irq_flags);
kcsan_restore_irqtrace(current);
+ ctx->disable_scoped--;
+
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * therefore never consider for reordering if access_mask is set.
+ * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
+ */
+ if (!access_mask && !is_assert)
+ set_reorder_access(ctx, ptr, size, type, ip);
out:
user_access_restore(ua_flags);
}
@@ -607,7 +707,6 @@ out:
static __always_inline void
check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
- const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
long encoded_watchpoint;
@@ -618,12 +717,14 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
if (unlikely(size == 0))
return;
+again:
/*
* Avoid user_access_save in fast-path: find_watchpoint is safe without
* user_access_save, as the address that ptr points to is only used to
* check if a watchpoint exists; ptr is never dereferenced.
*/
- watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ watchpoint = find_watchpoint((unsigned long)ptr, size,
+ !(type & KCSAN_ACCESS_WRITE),
&encoded_watchpoint);
/*
* It is safe to check kcsan_is_enabled() after find_watchpoint in the
@@ -637,9 +738,42 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
- if (unlikely(should_watch(ctx, ptr, size, type)))
+ if (unlikely(should_watch(ctx, ptr, size, type))) {
kcsan_setup_watchpoint(ptr, size, type, ip);
- else if (unlikely(ctx->scoped_accesses.prev))
+ return;
+ }
+
+ if (!(type & KCSAN_ACCESS_SCOPED)) {
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (reorder_access) {
+ /*
+ * reorder_access check: simulates reordering of
+ * the access after subsequent operations.
+ */
+ ptr = reorder_access->ptr;
+ type = reorder_access->type;
+ ip = reorder_access->ip;
+ /*
+ * Upon a nested interrupt, this context's
+ * reorder_access can be modified (shared ctx).
+ * We know that upon return, reorder_access is
+ * always invalidated by setting size to 0 via
+ * __tsan_func_exit(). Therefore we must read
+ * and check size after the other fields.
+ */
+ barrier();
+ size = READ_ONCE(reorder_access->size);
+ if (size)
+ goto again;
+ }
+ }
+
+ /*
+ * Always checked last, right before returning from runtime;
+ * if reorder_access is valid, checked after it was checked.
+ */
+ if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
}
@@ -814,6 +948,22 @@ void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
}
EXPORT_SYMBOL(__kcsan_check_access);
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
+ void __kcsan_##name(void) \
+ { \
+ struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
+ if (!sa) \
+ return; \
+ if (order_before_cond) \
+ sa->size = 0; \
+ } \
+ EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
/*
* KCSAN uses the same instrumentation that is emitted by supported compilers
* for ThreadSanitizer (TSAN).
@@ -926,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);
/*
- * The below are not required by KCSAN, but can still be emitted by the
- * compiler.
+ * Function entry and exit are used to determine the validty of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ * This minimizes false positives due to uninstrumented functions that
+ * contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
*/
void __tsan_func_entry(void *call_pc);
-void __tsan_func_entry(void *call_pc)
+noinline void __tsan_func_entry(void *call_pc)
{
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);
+
void __tsan_func_exit(void);
-void __tsan_func_exit(void)
+noinline void __tsan_func_exit(void)
{
+ struct kcsan_scoped_access *reorder_access;
+
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ reorder_access = get_reorder_access(get_ctx());
+ if (!reorder_access)
+ goto out;
+
+ if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+ /*
+ * Access check to catch cases where write without a barrier
+ * (supposed release) was last access in function: because
+ * instrumentation is inserted before the real access, a data
+ * race due to the write giving up a c-s would only be caught if
+ * we do the conflicting access after.
+ */
+ check_access(reorder_access->ptr, reorder_access->size,
+ reorder_access->type, reorder_access->ip);
+ reorder_access->size = 0;
+ reorder_access->stack_depth = INT_MIN;
+ }
+out:
+ add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);
+
void __tsan_init(void);
void __tsan_init(void)
{
@@ -961,10 +1148,19 @@ EXPORT_SYMBOL(__tsan_init);
* functions, whose job is to also execute the operation itself.
*/
+static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
+{
+ if (memorder == __ATOMIC_RELEASE ||
+ memorder == __ATOMIC_SEQ_CST ||
+ memorder == __ATOMIC_ACQ_REL)
+ __kcsan_release();
+}
+
#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
@@ -974,6 +1170,7 @@ EXPORT_SYMBOL(__tsan_init);
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
@@ -986,6 +1183,7 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
@@ -1018,6 +1216,7 @@ EXPORT_SYMBOL(__tsan_init);
int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
u##bits val, int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
@@ -1033,6 +1232,7 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
@@ -1064,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64);
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
{
+ kcsan_atomic_builtin_memorder(memorder);
__atomic_thread_fence(memorder);
}
EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+/*
+ * In instrumented files, we emit instrumentation for barriers by mapping the
+ * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
+ * and otherwise has no relation to a real __atomic_signal_fence(). No known
+ * kernel code uses __atomic_signal_fence().
+ *
+ * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
+ * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation
+ * can be disabled via the __no_kcsan function attribute (vs. an explicit call
+ * which could not). When __no_kcsan is requested, __atomic_signal_fence()
+ * generates no code.
+ *
+ * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
+ * potentially limiting the compiler's ability to reorder operations; however,
+ * if barriers were instrumented with explicit calls (without LTO), the compiler
+ * couldn't optimize much anyway. The result of a hypothetical architecture
+ * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
+ */
void __tsan_atomic_signal_fence(int memorder);
-void __tsan_atomic_signal_fence(int memorder) { }
+noinline void __tsan_atomic_signal_fence(int memorder)
+{
+ switch (memorder) {
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
+ __kcsan_mb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
+ __kcsan_wmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
+ __kcsan_rmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
+ __kcsan_release();
+ break;
+ default:
+ break;
+ }
+}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 660729238588..a36fca063a73 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -16,9 +16,12 @@
#define pr_fmt(fmt) "kcsan_test: " fmt
#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
+#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
@@ -151,7 +154,7 @@ struct expect_report {
/* Check observed report matches information in @r. */
__no_kcsan
-static bool report_matches(const struct expect_report *r)
+static bool __report_matches(const struct expect_report *r)
{
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
@@ -213,9 +216,9 @@ static bool report_matches(const struct expect_report *r)
const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
const char *const access_type_aux =
- (is_atomic && is_scoped) ? " (marked, scoped)"
+ (is_atomic && is_scoped) ? " (marked, reordered)"
: (is_atomic ? " (marked)"
- : (is_scoped ? " (scoped)" : ""));
+ : (is_scoped ? " (reordered)" : ""));
if (i == 1) {
/* Access 2 */
@@ -253,6 +256,40 @@ out:
return ret;
}
+static __always_inline const struct expect_report *
+__report_set_scoped(struct expect_report *r, int accesses)
+{
+ BUILD_BUG_ON(accesses > 3);
+
+ if (accesses & 1)
+ r->access[0].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
+
+ if (accesses & 2)
+ r->access[1].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
+
+ return r;
+}
+
+__no_kcsan
+static bool report_matches_any_reordered(struct expect_report *r)
+{
+ return __report_matches(__report_set_scoped(r, 0)) ||
+ __report_matches(__report_set_scoped(r, 1)) ||
+ __report_matches(__report_set_scoped(r, 2)) ||
+ __report_matches(__report_set_scoped(r, 3));
+}
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+/* Due to reordering accesses, any access may appear as "(reordered)". */
+#define report_matches report_matches_any_reordered
+#else
+#define report_matches __report_matches
+#endif
+
/* ===== Test kernels ===== */
static long test_sink;
@@ -263,6 +300,8 @@ static struct {
long val[8];
} test_struct;
static DEFINE_SEQLOCK(test_seqlock);
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_MUTEX(test_mutex);
/*
* Helper to avoid compiler optimizing out reads, and to generate source values
@@ -271,6 +310,16 @@ static DEFINE_SEQLOCK(test_seqlock);
__no_kcsan
static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+/*
+ * Generates a delay and some accesses that enter the runtime but do not produce
+ * data races.
+ */
+static noinline void test_delay(int iter)
+{
+ while (iter--)
+ sink_value(READ_ONCE(test_sink));
+}
+
static noinline void test_kernel_read(void) { sink_value(test_var); }
static noinline void test_kernel_write(void)
@@ -432,19 +481,239 @@ static noinline void test_kernel_xor_1bit(void)
kcsan_nestable_atomic_end();
}
+#define TEST_KERNEL_LOCKED(name, acquire, release) \
+ static noinline void test_kernel_##name(void) \
+ { \
+ long *flag = &test_struct.val[0]; \
+ long v = 0; \
+ if (!(acquire)) \
+ return; \
+ while (v++ < 100) { \
+ test_var++; \
+ barrier(); \
+ } \
+ release; \
+ test_delay(10); \
+ }
+
+TEST_KERNEL_LOCKED(with_memorder,
+ cmpxchg_acquire(flag, 0, 1) == 0,
+ smp_store_release(flag, 0));
+TEST_KERNEL_LOCKED(wrong_memorder,
+ cmpxchg_relaxed(flag, 0, 1) == 0,
+ WRITE_ONCE(*flag, 0));
+TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELEASE));
+TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELAXED));
+
/* ===== Test cases ===== */
+/*
+ * Tests that various barriers have the expected effect on internal state. Not
+ * exhaustive on atomic_t operations. Unlike the selftest, also checks for
+ * too-strict barrier instrumentation; these can be tolerated, because it does
+ * not cause false positives, but at least we should be aware of such cases.
+ */
+static void test_barrier_nothreads(struct kunit *test)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+
+ KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
+ KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
+
+#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = sizeof(test_var); \
+ barrier; \
+ KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
+ order_before ? 0 : sizeof(test_var), \
+ "improperly instrumented type=(" #access_type "): " name); \
+ } while (0)
+#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
+#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
+#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
+
+ /*
+ * Lockdep initialization can strengthen certain locking operations due
+ * to calling into instrumented files; "warm up" our locks.
+ */
+ spin_lock(&test_spinlock);
+ spin_unlock(&test_spinlock);
+ mutex_lock(&test_mutex);
+ mutex_unlock(&test_mutex);
+
+ /* Force creating a valid entry in reorder_access first. */
+ test_var = 0;
+ while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
+ __kcsan_check_read(&test_var, sizeof(test_var));
+ KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_EXPECT_READ_BARRIER(mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_RW_BARRIER(mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
+
+#ifdef clear_bit_unlock_is_negative_byte
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+#endif
+ kcsan_nestable_atomic_end();
+}
+
/* Simple test with normal data race. */
__no_kcsan
static void test_basic(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- static const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -469,14 +738,14 @@ static void test_basic(struct kunit *test)
__no_kcsan
static void test_concurrent_races(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
/* NULL will match any address. */
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
},
};
- static const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_rmw_array, NULL, 0, 0 },
{ test_kernel_rmw_array, NULL, 0, 0 },
@@ -498,13 +767,13 @@ static void test_concurrent_races(struct kunit *test)
__no_kcsan
static void test_novalue_change(struct kunit *test)
{
- const struct expect_report expect_rw = {
+ struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- const struct expect_report expect_ww = {
+ struct expect_report expect_ww = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -530,13 +799,13 @@ static void test_novalue_change(struct kunit *test)
__no_kcsan
static void test_novalue_change_exception(struct kunit *test)
{
- const struct expect_report expect_rw = {
+ struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- const struct expect_report expect_ww = {
+ struct expect_report expect_ww = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -556,7 +825,7 @@ static void test_novalue_change_exception(struct kunit *test)
__no_kcsan
static void test_unknown_origin(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ NULL },
@@ -578,7 +847,7 @@ static void test_unknown_origin(struct kunit *test)
__no_kcsan
static void test_write_write_assume_atomic(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -604,7 +873,7 @@ static void test_write_write_assume_atomic(struct kunit *test)
__no_kcsan
static void test_write_write_struct(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
@@ -626,7 +895,7 @@ static void test_write_write_struct(struct kunit *test)
__no_kcsan
static void test_write_write_struct_part(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
@@ -658,7 +927,7 @@ static void test_read_atomic_write_atomic(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_write(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
@@ -679,7 +948,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_rmw(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_atomic_rmw, &test_var, sizeof(test_var),
@@ -701,13 +970,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
__no_kcsan
static void test_zero_size_access(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
@@ -741,7 +1010,7 @@ static void test_data_race(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -759,7 +1028,7 @@ static void test_assert_exclusive_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -777,19 +1046,19 @@ static void test_assert_exclusive_access(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access_writer(struct kunit *test)
{
- const struct expect_report expect_access_writer = {
+ struct expect_report expect_access_writer = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
},
};
- const struct expect_report expect_access_access = {
+ struct expect_report expect_access_access = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
@@ -813,7 +1082,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_bits_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_change_bits, &test_var, sizeof(test_var),
@@ -844,13 +1113,13 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer_scoped(struct kunit *test)
{
- const struct expect_report expect_start = {
+ struct expect_report expect_start = {
.access = {
{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report expect_inscope = {
+ struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -871,16 +1140,16 @@ static void test_assert_exclusive_writer_scoped(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access_scoped(struct kunit *test)
{
- const struct expect_report expect_start1 = {
+ struct expect_report expect_start1 = {
.access = {
{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- const struct expect_report expect_start2 = {
+ struct expect_report expect_start2 = {
.access = { expect_start1.access[0], expect_start1.access[0] },
};
- const struct expect_report expect_inscope = {
+ struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -985,7 +1254,7 @@ static void test_atomic_builtins(struct kunit *test)
__no_kcsan
static void test_1bit_value_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
@@ -1005,6 +1274,90 @@ static void test_1bit_value_change(struct kunit *test)
KUNIT_EXPECT_TRUE(test, match);
}
+__no_kcsan
+static void test_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_with_memorder,
+ test_kernel_atomic_builtin_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
+ test_kernel_atomic_builtin_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
/*
* Generate thread counts for all test cases. Values generated are in interval
* [2, 5] followed by exponentially increasing thread counts from 8 to 32.
@@ -1054,6 +1407,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
static struct kunit_case kcsan_test_cases[] = {
+ KUNIT_CASE(test_barrier_nothreads),
KCSAN_KUNIT_CASE(test_basic),
KCSAN_KUNIT_CASE(test_concurrent_races),
KCSAN_KUNIT_CASE(test_novalue_change),
@@ -1078,6 +1432,10 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_seqlock_noreport),
KCSAN_KUNIT_CASE(test_atomic_builtins),
KCSAN_KUNIT_CASE(test_1bit_value_change),
+ KCSAN_KUNIT_CASE(test_correct_barrier),
+ KCSAN_KUNIT_CASE(test_missing_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
{},
};
@@ -1142,6 +1500,9 @@ static int test_init(struct kunit *test)
observed.nlines = 0;
spin_unlock_irqrestore(&observed.lock, flags);
+ if (strstr(test->name, "nothreads"))
+ return 0;
+
if (!torture_init_begin((char *)test->name, 1))
return -EBUSY;
@@ -1184,6 +1545,9 @@ static void test_exit(struct kunit *test)
struct task_struct **stop_thread;
int i;
+ if (strstr(test->name, "nothreads"))
+ return;
+
if (torture_cleanup_begin())
return;
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index fc15077991c4..67794404042a 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -215,9 +215,9 @@ static const char *get_access_type(int type)
if (type & KCSAN_ACCESS_ASSERT) {
if (type & KCSAN_ACCESS_SCOPED) {
if (type & KCSAN_ACCESS_WRITE)
- return "assert no accesses (scoped)";
+ return "assert no accesses (reordered)";
else
- return "assert no writes (scoped)";
+ return "assert no writes (reordered)";
} else {
if (type & KCSAN_ACCESS_WRITE)
return "assert no accesses";
@@ -240,17 +240,17 @@ static const char *get_access_type(int type)
case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
- return "read (scoped)";
+ return "read (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
- return "read (marked, scoped)";
+ return "read (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
- return "write (scoped)";
+ return "write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
- return "write (marked, scoped)";
+ return "write (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
- return "read-write (scoped)";
+ return "read-write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
- return "read-write (marked, scoped)";
+ return "read-write (marked, reordered)";
default:
BUG();
}
@@ -308,10 +308,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
/*
* Skips to the first entry that matches the function of @ip, and then replaces
- * that entry with @ip, returning the entries to skip.
+ * that entry with @ip, returning the entries to skip with @replaced containing
+ * the replaced entry.
*/
static int
-replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip)
+replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
{
unsigned long symbolsize, offset;
unsigned long target_func;
@@ -330,6 +332,7 @@ replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned lon
func -= offset;
if (func == target_func) {
+ *replaced = stack_entries[skip];
stack_entries[skip] = ip;
return skip;
}
@@ -342,9 +345,10 @@ fallback:
}
static int
-sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip)
+sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
{
- return ip ? replace_stack_entry(stack_entries, num_entries, ip) :
+ return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
get_stack_skipnr(stack_entries, num_entries);
}
@@ -360,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2)
return strncmp(buf1, buf2, sizeof(buf1));
}
+static void
+print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
+{
+ stack_trace_print(stack_entries, num_entries, 0);
+ if (reordered_to)
+ pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
+}
+
static void print_verbose_info(struct task_struct *task)
{
if (!task)
@@ -378,10 +390,12 @@ static void print_report(enum kcsan_value_change value_change,
struct other_info *other_info,
u64 old, u64 new, u64 mask)
{
+ unsigned long reordered_to = 0;
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
- int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip);
+ int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_reordered_to = 0;
unsigned long other_frame = 0;
int other_skipnr = 0; /* silence uninit warnings */
@@ -394,7 +408,7 @@ static void print_report(enum kcsan_value_change value_change,
if (other_info) {
other_skipnr = sanitize_stack_entries(other_info->stack_entries,
other_info->num_stack_entries,
- other_info->ai.ip);
+ other_info->ai.ip, &other_reordered_to);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
@@ -434,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change,
other_info->ai.cpu_id);
/* Print the other thread's stack trace. */
- stack_trace_print(other_info->stack_entries + other_skipnr,
+ print_stack_trace(other_info->stack_entries + other_skipnr,
other_info->num_stack_entries - other_skipnr,
- 0);
-
+ other_reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(other_info->task);
@@ -451,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change,
get_thread_desc(ai->task_pid), ai->cpu_id);
}
/* Print stack trace of this thread. */
- stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
- 0);
-
+ print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index b4295a3892b7..75712959c84e 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -7,10 +7,15 @@
#define pr_fmt(fmt) "kcsan: " fmt
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/init.h>
+#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
#include <linux/types.h>
#include "encoding.h"
@@ -103,6 +108,143 @@ static bool __init test_matching_access(void)
return true;
}
+/*
+ * Correct memory barrier instrumentation is critical to avoiding false
+ * positives: simple test to check at boot certain barriers are always properly
+ * instrumented. See kcsan_test for a more complete test.
+ */
+static DEFINE_SPINLOCK(test_spinlock);
+static bool __init test_barrier(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ bool ret = true;
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+ long test_var;
+
+ if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
+ return true;
+
+#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = 1; \
+ barrier; \
+ if (reorder_access->size != 0) { \
+ pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
+ ret = false; \
+ } \
+ } while (0)
+#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
+#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
+#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_CHECK_READ_BARRIER(mb());
+ KCSAN_CHECK_READ_BARRIER(rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb());
+ KCSAN_CHECK_READ_BARRIER(smp_rmb());
+ KCSAN_CHECK_READ_BARRIER(dma_rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_WRITE_BARRIER(mb());
+ KCSAN_CHECK_WRITE_BARRIER(wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_RW_BARRIER(mb());
+ KCSAN_CHECK_RW_BARRIER(wmb());
+ KCSAN_CHECK_RW_BARRIER(rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb());
+ KCSAN_CHECK_RW_BARRIER(smp_wmb());
+ KCSAN_CHECK_RW_BARRIER(smp_rmb());
+ KCSAN_CHECK_RW_BARRIER(dma_wmb());
+ KCSAN_CHECK_RW_BARRIER(dma_rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
+
+#ifdef clear_bit_unlock_is_negative_byte
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+#endif
+ kcsan_nestable_atomic_end();
+
+ return ret;
+}
+
static int __init kcsan_selftest(void)
{
int passed = 0;
@@ -120,6 +262,7 @@ static int __init kcsan_selftest(void)
RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
+ RUN_TEST(test_barrier);
pr_info("selftest: %d/%d tests passed\n", passed, total);
if (passed != total)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 397ac13d2ef7..9c2fb613a55d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1047,7 +1047,7 @@ static int __init lock_torture_init(void)
sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -1058,7 +1058,7 @@ static int __init lock_torture_init(void)
sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b8251dc0bc0f..ba005ebf4730 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -20,12 +20,13 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
- WARN(1, "double register detected");
- return 0;
+ WARN(1, "notifier callback %ps already registered",
+ n->notifier_call);
+ return -EEXIST;
}
if (n->priority > (*nl)->priority)
break;
@@ -134,7 +135,7 @@ static int notifier_call_chain_robust(struct notifier_block **nl,
*
* Adds a notifier to an atomic notifier chain.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
@@ -216,7 +217,7 @@ NOKPROBE_SYMBOL(atomic_notifier_call_chain);
* Adds a notifier to a blocking notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
struct notifier_block *n)
@@ -335,7 +336,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
@@ -406,7 +407,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 326f8d032eb5..b4f433943209 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -170,6 +170,7 @@ extern int swsusp_swap_in_use(void);
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
+#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
extern int swsusp_check(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ff326c2cb77b..ad10359030a4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -36,6 +36,8 @@
#define HIBERNATE_SIG "S1SUSPEND"
+u32 swsusp_hardware_signature;
+
/*
* When reading an {un,}compressed image, we may restore pages in place,
* in which case some architectures need these pages cleaning before they
@@ -104,7 +106,8 @@ struct swap_map_handle {
struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
- sizeof(u32)];
+ sizeof(u32) - sizeof(u32)];
+ u32 hw_sig;
u32 crc32;
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
@@ -312,7 +315,6 @@ static int hib_wait_io(struct hib_bio_batch *hb)
/*
* Saving part
*/
-
static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
@@ -324,6 +326,10 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
swsusp_header->image = handle->first_sector;
+ if (swsusp_hardware_signature) {
+ swsusp_header->hw_sig = swsusp_hardware_signature;
+ flags |= SF_HW_SIG;
+ }
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
@@ -1537,6 +1543,12 @@ int swsusp_check(void)
} else {
error = -EINVAL;
}
+ if (!error && swsusp_header->flags & SF_HW_SIG &&
+ swsusp_header->hw_sig != swsusp_hardware_signature) {
+ pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n",
+ swsusp_header->hw_sig, swsusp_hardware_signature);
+ error = -EINVAL;
+ }
put:
if (error)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 57b132b658e1..155229f0cf0f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -280,7 +280,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
-static bool has_preferred_console;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -2861,7 +2860,8 @@ early_param("keep_bootcon", keep_bootcon_setup);
* Care need to be taken with consoles that are statically
* enabled such as netconsole
*/
-static int try_enable_new_console(struct console *newcon, bool user_specified)
+static int try_enable_preferred_console(struct console *newcon,
+ bool user_specified)
{
struct console_cmdline *c;
int i, err;
@@ -2891,10 +2891,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return err;
}
newcon->flags |= CON_ENABLED;
- if (i == preferred_console) {
+ if (i == preferred_console)
newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
return 0;
}
@@ -2909,6 +2907,21 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return -ENOENT;
}
+/* Try to enable the console unconditionally */
+static void try_enable_default_console(struct console *newcon)
+{
+ if (newcon->index < 0)
+ newcon->index = 0;
+
+ if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ return;
+
+ newcon->flags |= CON_ENABLED;
+
+ if (newcon->device)
+ newcon->flags |= CON_CONSDEV;
+}
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -2930,59 +2943,56 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
- struct console *bcon = NULL;
+ struct console *con;
+ bool bootcon_enabled = false;
+ bool realcon_enabled = false;
int err;
- for_each_console(bcon) {
- if (WARN(bcon == newcon, "console '%s%d' already registered\n",
- bcon->name, bcon->index))
+ for_each_console(con) {
+ if (WARN(con == newcon, "console '%s%d' already registered\n",
+ con->name, con->index))
return;
}
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (newcon->flags & CON_BOOT) {
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
- }
+ for_each_console(con) {
+ if (con->flags & CON_BOOT)
+ bootcon_enabled = true;
+ else
+ realcon_enabled = true;
}
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
-
- if (!has_preferred_console || bcon || !console_drivers)
- has_preferred_console = preferred_console >= 0;
+ /* Do not register boot consoles when there already is a real one. */
+ if (newcon->flags & CON_BOOT && realcon_enabled) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ return;
+ }
/*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
+ * See if we want to enable this console driver by default.
+ *
+ * Nope when a console is preferred by the command line, device
+ * tree, or SPCR.
+ *
+ * The first real console with tty binding (driver) wins. More
+ * consoles might get enabled before the right one is found.
+ *
+ * Note that a console with tty binding will have CON_CONSDEV
+ * flag set and will be first in the list.
*/
- if (!has_preferred_console) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
+ if (preferred_console < 0) {
+ if (!console_drivers || !console_drivers->device ||
+ console_drivers->flags & CON_BOOT) {
+ try_enable_default_console(newcon);
}
}
/* See if this console matches one we selected on the command line */
- err = try_enable_new_console(newcon, true);
+ err = try_enable_preferred_console(newcon, true);
/* If not, try to match against the platform default(s) */
if (err == -ENOENT)
- err = try_enable_new_console(newcon, false);
+ err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
if (err || newcon->flags & CON_BRL)
@@ -2994,8 +3004,10 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bootcon_enabled &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ }
/*
* Put this console in the list - keep the
@@ -3051,15 +3063,15 @@ void register_console(struct console *newcon)
pr_info("%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,
newcon->name, newcon->index);
- if (bcon &&
+ if (bootcon_enabled &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
/* We need to iterate through all boot consoles, to make
* sure we print everything out, before we unregister them.
*/
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
+ for_each_console(con)
+ if (con->flags & CON_BOOT)
+ unregister_console(con);
}
}
EXPORT_SYMBOL(register_console);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3128b7cf8e1f..bf8e341e75b4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -112,7 +112,7 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
@@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF
Take the default if unsure.
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
config RCU_BOOST
bool "Enable RCU priority boosting"
depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index aaa111237b60..81145c3ece25 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -261,16 +261,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Mark the specified rcu_segcblist structure as offloaded.
+ * Mark the specified rcu_segcblist structure as offloaded (or not)
*/
void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- if (offload) {
- rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
- rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
- } else {
+ if (offload)
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
+ else
rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
- }
}
/*
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 9a19328ff251..e373fbe44da5 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -80,11 +80,14 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */
+/*
+ * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the
+ * [de]offloading process)?
+ */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY))
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
return true;
return false;
@@ -92,9 +95,8 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
{
- int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED;
-
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags)
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
return true;
return false;
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 228f143bf935..5e4f1f83d38e 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -50,8 +50,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s)
#define VERBOSE_SCALEOUT_STRING(s) \
do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0)
+#define SCALEOUT_ERRSTRING(s) \
+ pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s)
/*
* The intended use cases for the nreaders and nwriters module parameters
@@ -514,11 +514,11 @@ rcu_scale_cleanup(void)
* during the mid-boot phase, so have to wait till the end.
*/
if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
if (rcu_gp_is_normal() && gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
if (gp_exp && gp_async)
- VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
if (torture_cleanup_begin())
return;
@@ -845,7 +845,7 @@ rcu_scale_init(void)
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -865,7 +865,7 @@ rcu_scale_init(void)
kcalloc(nrealwriters, sizeof(*writer_n_durations),
GFP_KERNEL);
if (!writer_tasks || !writer_durations || !writer_n_durations) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8b410d982990..33ea446101b3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/tick.h>
#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
#include "rcu.h"
@@ -53,15 +54,18 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1)
+#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
@@ -75,7 +79,7 @@ torture_param(int, fqs_duration, 0,
"Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Test grace-period forward progress");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
torture_param(int, fwd_progress_holdoff, 60,
"Time between forward-progress tests (s)");
@@ -109,6 +113,8 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false,
+ "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
torture_param(int, stall_gp_kthread, 0,
@@ -140,7 +146,7 @@ static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
-static struct task_struct *fwd_prog_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
@@ -342,10 +348,12 @@ struct rcu_torture_ops {
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
+ int no_pi_lock;
const char *name;
};
@@ -667,6 +675,7 @@ static struct rcu_torture_ops srcu_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
};
@@ -700,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
};
@@ -720,6 +730,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted_srcud"
};
@@ -831,6 +842,7 @@ static struct rcu_torture_ops tasks_rude_ops = {
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -871,6 +883,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -1420,13 +1433,15 @@ static void rcutorture_one_extend(int *readstate, int newstate,
struct rt_read_seg *rtrsp)
{
unsigned long flags;
- int idxnew = -1;
- int idxold = *readstate;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0);
- WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -1440,8 +1455,10 @@ static void rcutorture_one_extend(int *readstate, int newstate,
preempt_disable();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
- if (statesnew & RCUTORTURE_RDR_RCU)
- idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
/*
* Next, remove old protection, in decreasing order of strength
@@ -1460,12 +1477,20 @@ static void rcutorture_one_extend(int *readstate, int newstate,
local_bh_enable();
if (statesold & RCUTORTURE_RDR_RBH)
rcu_read_unlock_bh();
- if (statesold & RCUTORTURE_RDR_RCU) {
- bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
@@ -1475,13 +1500,19 @@ static void rcutorture_one_extend(int *readstate, int newstate,
cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
+ pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
+ pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1491,7 +1522,7 @@ static int rcutorture_extend_mask_max(void)
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
return mask;
}
@@ -1506,13 +1537,21 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
/*
* Can't enable bh w/irq disabled.
*/
@@ -1532,7 +1571,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
mask |= oldmask & bhs;
}
- return mask ?: RCUTORTURE_RDR_RCU;
+ return mask ?: RCUTORTURE_RDR_RCU_1;
}
/*
@@ -1626,7 +1665,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
+ WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
@@ -2052,6 +2091,8 @@ static int rcu_torture_stall(void *args)
#else
schedule_timeout_uninterruptible(HZ);
#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
}
if (stall_cpu_irqsoff)
local_irq_enable();
@@ -2123,10 +2164,13 @@ struct rcu_fwd {
unsigned long rcu_fwd_startat;
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
};
static DEFINE_MUTEX(rcu_fwd_mutex);
static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -2139,8 +2183,9 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rfp->rcu_fwd_startat);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
gps = rfp->n_launders_hist[j].launder_gp_seq;
@@ -2151,6 +2196,7 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
gps_old = gps;
}
pr_cont("\n");
+ mutex_unlock(&rcu_fwd_mutex);
}
/* Callback function for continuous-flood RCU callbacks. */
@@ -2276,7 +2322,8 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
}
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
@@ -2344,7 +2391,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
- } else {
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
@@ -2354,8 +2401,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_sa = 0;
rfcp->rfc_gps = 0;
rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
}
- cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
@@ -2379,6 +2429,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
rcu_torture_fwd_cb_hist(rfp);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
@@ -2394,6 +2445,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ int i;
+ long ncbs;
struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex);
@@ -2404,18 +2457,26 @@ static int rcutorture_oom_notify(struct notifier_block *self,
}
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist(rfp);
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
@@ -2430,7 +2491,10 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ bool firsttime = true;
+ long max_cbs;
int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -2440,21 +2504,38 @@ static int rcu_torture_fwd_prog(void *args)
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- WRITE_ONCE(rcu_fwd_emergency_stop, false);
- if (!IS_ENABLED(CONFIG_TINY_RCU) ||
- rcu_inkernel_boot_has_ended())
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- if (rcu_inkernel_boot_has_ended())
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq)
+ schedule_timeout_interruptible(1);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
rcu_torture_fwd_prog_cr(rfp);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */
if (stutter_wait("rcu_torture_fwd_prog"))
sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
- WARN_ON(!tested && tested_tries >= 5);
- pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
torture_kthread_stopping("rcu_torture_fwd_prog");
return 0;
}
@@ -2462,17 +2543,28 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ int i;
+ int ret = 0;
struct rcu_fwd *rfp;
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
if ((!cur_ops->sync && !cur_ops->call) ||
- !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) {
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
+ cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
return 0;
}
if (stall_cpu > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
+ fwd_progress = 0;
if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
WARN_ON(1); /* Make sure rcutorture notices conflict. */
@@ -2482,29 +2574,51 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
- if (!rfp)
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
return -ENOMEM;
- spin_lock_init(&rfp->rcu_fwd_lock);
- rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
mutex_lock(&rcu_fwd_mutex);
rcu_fwds = rfp;
mutex_unlock(&rcu_fwd_mutex);
register_oom_notifier(&rcutorture_oom_nb);
- return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
}
static void rcu_torture_fwd_prog_cleanup(void)
{
+ int i;
struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
- rfp = rcu_fwds;
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
rcu_fwds = NULL;
mutex_unlock(&rcu_fwd_mutex);
- unregister_oom_notifier(&rcutorture_oom_nb);
kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
}
/* Callback function for RCU barrier testing. */
@@ -2741,7 +2855,7 @@ static int rcu_torture_read_exit(void *unused)
&trs, "%s",
"rcu_torture_read_exit_child");
if (IS_ERR(tsp)) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
errexit = true;
tsp = NULL;
break;
@@ -3068,7 +3182,7 @@ rcu_torture_init(void)
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3084,7 +3198,7 @@ rcu_torture_init(void)
rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
GFP_KERNEL);
if (!reader_tasks || !rcu_torture_reader_mbchk) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3103,7 +3217,7 @@ rcu_torture_init(void)
if (nrealnocbers > 0) {
nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
if (nocb_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1631ef8a138d..5489ff7f478e 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -44,7 +44,10 @@
pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
#define VERBOSE_SCALEOUT(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+ do { \
+ if (verbose) \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } while (0)
static atomic_t verbose_batch_ctr;
@@ -54,12 +57,11 @@ do { \
(verbose_batched <= 0 || \
!(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
schedule_timeout_uninterruptible(1); \
- pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
} \
} while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
@@ -604,7 +606,7 @@ static u64 process_durations(int n)
char *buf;
u64 sum = 0;
- buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+ buf = kmalloc(800 + 64, GFP_KERNEL);
if (!buf)
return 0;
buf[0] = 0;
@@ -617,13 +619,15 @@ static u64 process_durations(int n)
if (i % 5 == 0)
strcat(buf, "\n");
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
strcat(buf, buf1);
sum += rt->last_duration_ns;
}
- strcat(buf, "\n");
-
- SCALEOUT("%s\n", buf);
+ pr_alert("%s\n", buf);
kfree(buf);
return sum;
@@ -637,7 +641,6 @@ static u64 process_durations(int n)
// point all the timestamps are printed.
static int main_func(void *arg)
{
- bool errexit = false;
int exp, r;
char buf1[64];
char *buf;
@@ -648,10 +651,10 @@ static int main_func(void *arg)
VERBOSE_SCALEOUT("main_func task started");
result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
- buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+ buf = kzalloc(800 + 64, GFP_KERNEL);
if (!result_avg || !buf) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
- errexit = true;
+ SCALEOUT_ERRSTRING("out of memory");
+ goto oom_exit;
}
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
@@ -663,8 +666,6 @@ static int main_func(void *arg)
// Start exp readers up per experiment
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
- if (errexit)
- break;
if (torture_must_stop())
goto end;
@@ -698,26 +699,23 @@ static int main_func(void *arg)
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
- if (!errexit) {
- buf[0] = 0;
- strcat(buf, "\n");
- strcat(buf, "Runs\tTime(ns)\n");
- }
-
+ pr_alert("Runs\tTime(ns)\n");
for (exp = 0; exp < nruns; exp++) {
u64 avg;
u32 rem;
- if (errexit)
- break;
avg = div_u64_rem(result_avg[exp], 1000, &rem);
sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
strcat(buf, buf1);
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
}
- if (!errexit)
- SCALEOUT("%s", buf);
+ pr_alert("%s", buf);
+oom_exit:
// This will shutdown everything including us.
if (shutdown) {
shutdown_start = 1;
@@ -841,12 +839,12 @@ ref_scale_init(void)
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+ VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index a0ba2ed49bc6..92c002d65482 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -99,7 +99,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 7da3c81c3f59..84f1d91604cc 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -6,6 +6,7 @@
*/
#ifdef CONFIG_TASKS_RCU_GENERIC
+#include "rcu_segcblist.h"
////////////////////////////////////////////////////////////////////////
//
@@ -20,11 +21,33 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
/**
+ * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism.
+ * @cblist: Callback list.
+ * @lock: Lock protecting per-CPU callback list.
+ * @rtp_jiffies: Jiffies counter value for statistics.
+ * @rtp_n_lock_retries: Rough lock-contention statistic.
+ * @rtp_work: Work queue for invoking callbacks.
+ * @rtp_irq_work: IRQ work queue for deferred wakeups.
+ * @barrier_q_head: RCU callback for barrier operation.
+ * @cpu: CPU number corresponding to this entry.
+ * @rtpp: Pointer to the rcu_tasks structure.
+ */
+struct rcu_tasks_percpu {
+ struct rcu_segcblist cblist;
+ raw_spinlock_t __private lock;
+ unsigned long rtp_jiffies;
+ unsigned long rtp_n_lock_retries;
+ struct work_struct rtp_work;
+ struct irq_work rtp_irq_work;
+ struct rcu_head barrier_q_head;
+ int cpu;
+ struct rcu_tasks *rtpp;
+};
+
+/**
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
- * @cbs_head: Head of callback list.
- * @cbs_tail: Tail pointer for callback list.
* @cbs_wq: Wait queue allowing new callback to get kthread's attention.
- * @cbs_lock: Lock protecting callback list.
+ * @cbs_gbl_lock: Lock protecting callback list.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
@@ -32,7 +55,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @n_gps: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
* @pregp_func: This flavor's pre-grace-period function (optional).
@@ -41,20 +64,27 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
+ * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
+ * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
+ * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
+ * @barrier_q_mutex: Serialize barrier operations.
+ * @barrier_q_count: Number of queues being waited on.
+ * @barrier_q_completion: Barrier wait/wakeup mechanism.
+ * @barrier_q_seq: Sequence number for barrier operations.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
struct rcu_tasks {
- struct rcu_head *cbs_head;
- struct rcu_head **cbs_tail;
struct wait_queue_head cbs_wq;
- raw_spinlock_t cbs_lock;
+ raw_spinlock_t cbs_gbl_lock;
int gp_state;
int gp_sleep;
int init_fract;
unsigned long gp_jiffies;
unsigned long gp_start;
- unsigned long n_gps;
+ unsigned long tasks_gp_seq;
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
@@ -65,20 +95,40 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ struct rcu_tasks_percpu __percpu *rtpcpu;
+ int percpu_enqueue_shift;
+ int percpu_enqueue_lim;
+ int percpu_dequeue_lim;
+ unsigned long percpu_dequeue_gpseq;
+ struct mutex barrier_q_mutex;
+ atomic_t barrier_q_count;
+ struct completion barrier_q_completion;
+ unsigned long barrier_q_seq;
char *name;
char *kname;
};
-#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
-static struct rcu_tasks rt_name = \
-{ \
- .cbs_tail = &rt_name.cbs_head, \
- .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
- .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \
- .gp_func = gp, \
- .call_func = call, \
- .name = n, \
- .kname = #rt_name, \
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
+
+#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
+static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
+ .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \
+}; \
+static struct rcu_tasks rt_name = \
+{ \
+ .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
+ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .gp_func = gp, \
+ .call_func = call, \
+ .rtpcpu = &rt_name ## __percpu, \
+ .name = n, \
+ .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_lim = 1, \
+ .percpu_dequeue_lim = 1, \
+ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
+ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \
+ .kname = #rt_name, \
}
/* Track exiting tasks in order to allow them to be waited for. */
@@ -94,6 +144,15 @@ module_param(rcu_task_ipi_delay, int, 0644);
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
+static int rcu_task_enqueue_lim __read_mostly = -1;
+module_param(rcu_task_enqueue_lim, int, 0444);
+
+static bool rcu_task_cb_adjust;
+static int rcu_task_contend_lim __read_mostly = 100;
+module_param(rcu_task_contend_lim, int, 0444);
+static int rcu_task_collapse_lim __read_mostly = 10;
+module_param(rcu_task_collapse_lim, int, 0444);
+
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
#define RTGS_WAIT_WAIT_CBS 1
@@ -128,6 +187,8 @@ static const char * const rcu_tasks_gp_state_names[] = {
//
// Generic code.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp);
+
/* Record grace-period phase and time. */
static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
{
@@ -148,23 +209,106 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+// Initialize per-CPU callback lists for the specified flavor of
+// Tasks RCU.
+static void cblist_init_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ int lim;
+
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rcu_task_enqueue_lim < 0) {
+ rcu_task_enqueue_lim = 1;
+ rcu_task_cb_adjust = true;
+ pr_info("%s: Setting adjustable number of callback queues.\n", __func__);
+ } else if (rcu_task_enqueue_lim == 0) {
+ rcu_task_enqueue_lim = 1;
+ }
+ lim = rcu_task_enqueue_lim;
+
+ if (lim > nr_cpu_ids)
+ lim = nr_cpu_ids;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim));
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(!rtpcp);
+ if (cpu)
+ raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ rcu_segcblist_init(&rtpcp->cblist);
+ INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
+ rtpcp->cpu = cpu;
+ rtpcp->rtpp = rtp;
+ raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim));
+}
+
+// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
+
+ rtp = rtpcp->rtpp;
+ wake_up(&rtp->cbs_wq);
+}
+
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
struct rcu_tasks *rtp)
{
unsigned long flags;
+ unsigned long j;
+ bool needadjust = false;
bool needwake;
+ struct rcu_tasks_percpu *rtpcp;
rhp->next = NULL;
rhp->func = func;
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- needwake = !rtp->cbs_head;
- WRITE_ONCE(*rtp->cbs_tail, rhp);
- rtp->cbs_tail = &rhp->next;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
+ local_irq_save(flags);
+ rcu_read_lock();
+ rtpcp = per_cpu_ptr(rtp->rtpcpu,
+ smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift));
+ if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ j = jiffies;
+ if (rtpcp->rtp_jiffies != j) {
+ rtpcp->rtp_jiffies = j;
+ rtpcp->rtp_n_lock_retries = 0;
+ }
+ if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
+ READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ needadjust = true; // Defer adjustment to avoid deadlock.
+ }
+ if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) {
+ raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
+ cblist_init_generic(rtp);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ }
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (unlikely(needadjust)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
+ WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ rcu_read_unlock();
/* We can't create the thread unless interrupts are enabled. */
if (needwake && READ_ONCE(rtp->kthread_ptr))
- wake_up(&rtp->cbs_wq);
+ irq_work_queue(&rtpcp->rtp_irq_work);
}
// Wait for a grace period for the specified flavor of Tasks RCU.
@@ -178,12 +322,173 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
wait_rcu_gp(rtp->call_func);
}
+// RCU callback function for rcu_barrier_tasks_generic().
+static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp;
+
+ rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
+ rtp = rtpcp->rtpp;
+ if (atomic_dec_and_test(&rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+}
+
+// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
+// Operates in a manner similar to rcu_barrier().
+static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq);
+
+ mutex_lock(&rtp->barrier_q_mutex);
+ if (rcu_seq_done(&rtp->barrier_q_seq, s)) {
+ smp_mb();
+ mutex_unlock(&rtp->barrier_q_mutex);
+ return;
+ }
+ rcu_seq_start(&rtp->barrier_q_seq);
+ init_completion(&rtp->barrier_q_completion);
+ atomic_set(&rtp->barrier_q_count, 2);
+ for_each_possible_cpu(cpu) {
+ if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim))
+ break;
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head))
+ atomic_inc(&rtp->barrier_q_count);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ if (atomic_sub_and_test(2, &rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+ wait_for_completion(&rtp->barrier_q_completion);
+ rcu_seq_end(&rtp->barrier_q_seq);
+ mutex_unlock(&rtp->barrier_q_mutex);
+}
+
+// Advance callbacks and indicate whether either a grace period or
+// callback invocation is needed.
+static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ long n;
+ long ncbs = 0;
+ long ncbsnz = 0;
+ int needgpcb = 0;
+
+ for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ /* Advance and accelerate any new callbacks. */
+ if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ // Should we shrink down to a single callback queue?
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (n) {
+ ncbs += n;
+ if (cpu > 0)
+ ncbsnz += n;
+ }
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
+ needgpcb |= 0x3;
+ if (!rcu_segcblist_empty(&rtpcp->cblist))
+ needgpcb |= 0x1;
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Shrink down to a single callback queue if appropriate.
+ // This is done in two stages: (1) If there are no more than
+ // rcu_task_collapse_lim callbacks on CPU 0 and none on any other
+ // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
+ // if there has not been an increase in callbacks, limit dequeuing
+ // to CPU 0. Note the matching RCU read-side critical section in
+ // call_rcu_tasks_generic().
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim > 1) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
+ smp_store_release(&rtp->percpu_enqueue_lim, 1);
+ rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ if (rcu_task_cb_adjust && !ncbsnz &&
+ poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
+ WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
+ pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+
+ return needgpcb;
+}
+
+// Advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
+{
+ int cpu;
+ int cpunext;
+ unsigned long flags;
+ int len;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_tasks_percpu *rtpcp_next;
+
+ cpu = rtpcp->cpu;
+ cpunext = cpu * 2 + 1;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpunext++;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
+
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ return;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ len = rcl.len;
+ for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ cond_resched();
+ }
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_add_len(&rtpcp->cblist, -len);
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+
+// Workqueue flood to advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work);
+
+ rtp = rtpcp->rtpp;
+ rcu_tasks_invoke_cbs(rtp, rtpcp);
+}
+
/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
{
- unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
+ int needgpcb;
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
@@ -199,42 +504,22 @@ static int __noreturn rcu_tasks_kthread(void *arg)
for (;;) {
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
- /* Pick up any new callbacks. */
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- smp_mb__after_spinlock(); // Order updates vs. GP.
- list = rtp->cbs_head;
- rtp->cbs_head = NULL;
- rtp->cbs_tail = &rtp->cbs_head;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
-
/* If there were none, wait a bit and start over. */
- if (!list) {
- wait_event_interruptible(rtp->cbs_wq,
- READ_ONCE(rtp->cbs_head));
- if (!rtp->cbs_head) {
- WARN_ON(signal_pending(current));
- set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_idle(HZ/10);
- }
- continue;
+ wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp)));
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
}
- // Wait for one grace period.
- set_tasks_gp_state(rtp, RTGS_WAIT_GP);
- rtp->gp_start = jiffies;
- rtp->gp_func(rtp);
- rtp->n_gps++;
-
- /* Invoke the callbacks. */
+ /* Invoke callbacks. */
set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
- while (list) {
- next = list->next;
- local_bh_disable();
- list->func(list);
- local_bh_enable();
- list = next;
- cond_resched();
- }
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+
/* Paranoid sleep to keep this from entering a tight loop */
schedule_timeout_idle(rtp->gp_sleep);
}
@@ -279,14 +564,15 @@ static void __init rcu_tasks_bootup_oddness(void)
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each...
pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
- data_race(rtp->n_gps),
+ data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
- ".C"[!!data_race(rtp->cbs_head)],
+ ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))],
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -411,10 +697,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
// read-side critical sections waited for by rcu_tasks_postscan().
//
-// Pre-grace-period update-side code is ordered before the grace via the
-// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side
-// code is ordered before the grace period via synchronize_rcu() call
-// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// Pre-grace-period update-side code is ordered before the grace
+// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+// is ordered before the grace period via synchronize_rcu() call in
+// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
// disabling.
/* Pre-grace-period preparation. */
@@ -586,13 +872,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
*/
void rcu_barrier_tasks(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks();
+ rcu_barrier_tasks_generic(&rcu_tasks);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
static int __init rcu_spawn_tasks_kthread(void)
{
+ cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
@@ -724,13 +1010,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
*/
void rcu_barrier_tasks_rude(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_rude();
+ rcu_barrier_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
static int __init rcu_spawn_tasks_rude_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
@@ -1073,25 +1359,50 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
// Any tasks that exit after this point will set ->trc_reader_checked.
}
+/* Communicate task state back to the RCU tasks trace stall warning request. */
+struct trc_stall_chk_rdr {
+ int nesting;
+ int ipi_to_cpu;
+ u8 needqs;
+};
+
+static int trc_check_slow_task(struct task_struct *t, void *arg)
+{
+ struct trc_stall_chk_rdr *trc_rdrp = arg;
+
+ if (task_curr(t))
+ return false; // It is running, so decline to inspect it.
+ trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
+ trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
+ trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
+ return true;
+}
+
/* Show the state of a task stalling the current RCU tasks trace GP. */
static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
{
int cpu;
+ struct trc_stall_chk_rdr trc_rdr;
+ bool is_idle_tsk = is_idle_task(t);
if (*firstreport) {
pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
*firstreport = false;
}
- // FIXME: This should attempt to use try_invoke_on_nonrunning_task().
cpu = task_cpu(t);
- pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
- t->pid,
- ".I"[READ_ONCE(t->trc_ipi_to_cpu) >= 0],
- ".i"[is_idle_task(t)],
- ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
- READ_ONCE(t->trc_reader_nesting),
- " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
- cpu);
+ if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
+ pr_alert("P%d: %c\n",
+ t->pid,
+ ".i"[is_idle_tsk]);
+ else
+ pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
+ t->pid,
+ ".I"[trc_rdr.ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk],
+ ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ trc_rdr.nesting,
+ " N"[!!trc_rdr.needqs],
+ cpu);
sched_show_task(t);
}
@@ -1121,7 +1432,8 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
- if (READ_ONCE(t->trc_reader_checked))
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
+ READ_ONCE(t->trc_reader_checked))
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
@@ -1156,7 +1468,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
// Yes, this assumes that CPUs process IPIs in order. If that ever
// changes, there will need to be a recheck and/or timed wait.
for_each_online_cpu(cpu)
- if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))
+ if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
// Remove the safety count.
@@ -1256,13 +1568,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
*/
void rcu_barrier_tasks_trace(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_trace();
+ rcu_barrier_tasks_generic(&rcu_tasks_trace);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
static int __init rcu_spawn_tasks_trace_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ef8d36f580fc..a4c25a6283b0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,7 +79,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_RCU_NOCB_CPU
- .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+ .cblist.flags = SEGCBLIST_RCU_CORE,
#endif
};
static struct rcu_state rcu_state = {
@@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user)
instrumentation_begin();
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
// instrumentation for the noinstr rcu_dynticks_eqs_enter()
@@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void)
trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
- if (!in_nmi())
- rcu_prepare_for_idle();
-
// instrumentation for the noinstr rcu_dynticks_eqs_enter()
instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
instrumentation_end();
@@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user)
// instrumentation for the noinstr rcu_dynticks_eqs_exit()
instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
- rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
@@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
// ... but is watching here.
- if (!in_nmi()) {
- instrumentation_begin();
- rcu_cleanup_after_idle();
- instrumentation_end();
- }
-
instrumentation_begin();
// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
@@ -1087,6 +1076,24 @@ void rcu_irq_enter_irqson(void)
}
/*
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API. This is used by
+ * the idle-entry code to figure out whether it is safe to disable the
+ * scheduler-clock interrupt.
+ *
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
+ */
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
+{
+ *nextevt = KTIME_MAX;
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
+}
+
+/*
* If any sort of urgency was applied to the current CPU (for example,
* the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
* to get to a quiescent state, disable it.
@@ -1467,7 +1474,7 @@ static void rcu_gp_kthread_wake(void)
{
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
- if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
@@ -1590,10 +1597,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
- !raw_spin_trylock_rcu_node(rnp))
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
return;
- WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
raw_spin_unlock_rcu_node(rnp);
}
@@ -2277,7 +2285,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
unsigned long flags;
unsigned long mask;
bool needwake = false;
- const bool offloaded = rcu_rdp_is_offloaded(rdp);
+ bool needacc = false;
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2304,15 +2312,30 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
+ *
+ * NOCB kthreads have their own way to deal with that...
*/
- if (!offloaded)
+ if (!rcu_rdp_is_offloaded(rdp)) {
needwake = rcu_accelerate_cbs(rnp, rdp);
+ } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ /*
+ * ...but NOCB kthreads may miss or delay callbacks acceleration
+ * if in the middle of a (de-)offloading process.
+ */
+ needacc = true;
+ }
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
if (needwake)
rcu_gp_kthread_wake();
+
+ if (needacc) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_accelerate_cbs_unlocked(rnp, rdp);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
}
}
@@ -2444,7 +2467,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
int div;
bool __maybe_unused empty;
unsigned long flags;
- const bool offloaded = rcu_rdp_is_offloaded(rdp);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count = 0;
@@ -2462,18 +2484,17 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
/*
- * Extract the list of ready callbacks, disabling to prevent
+ * Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
*/
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
pending = rcu_segcblist_n_cbs(&rdp->cblist);
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
- if (unlikely(bl > 100)) {
+ if (in_serving_softirq() && unlikely(bl > 100)) {
long rrn = READ_ONCE(rcu_resched_ns);
rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
@@ -2482,7 +2503,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- if (offloaded)
+ if (rcu_rdp_is_offloaded(rdp))
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
@@ -2510,18 +2531,21 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
*/
- if (count >= bl && !offloaded &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((count & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
- break;
- }
- if (!in_serving_softirq()) {
+ if (in_serving_softirq()) {
+ if (count >= bl && (need_resched() || !is_idle_task(current)))
+ break;
+ /*
+ * Make sure we don't spend too much time here and deprive other
+ * softirq vectors of CPU cycles.
+ */
+ if (unlikely(tlimit)) {
+ /* only call local_clock() every 32 callbacks */
+ if (likely((count & 31) || local_clock() < tlimit))
+ continue;
+ /* Exceeded the time limit, so leave. */
+ break;
+ }
+ } else {
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -2530,8 +2554,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
}
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2565,9 +2588,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_unlock_irqrestore(rdp, flags);
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_core();
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
@@ -2706,6 +2726,23 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ /*
+ * On RT rcu_core() can be preempted when IRQs aren't disabled.
+ * Therefore this function can race with concurrent NOCB (de-)offloading
+ * on this CPU and the below condition must be considered volatile.
+ * However if we race with:
+ *
+ * _ Offloading: In the worst case we accelerate or process callbacks
+ * concurrently with NOCB kthreads. We are guaranteed to
+ * call rcu_nocb_lock() if that happens.
+ *
+ * _ Deoffloading: In the worst case we miss callbacks acceleration or
+ * processing. This is fine because the early stage
+ * of deoffloading invokes rcu_core() after setting
+ * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
+ * what could have been dismissed without the need to wait
+ * for the next rcu_pending() check in the next jiffy.
+ */
const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
@@ -2714,7 +2751,7 @@ static __latent_entropy void rcu_core(void)
WARN_ON_ONCE(!rdp->beenonline);
/* Report any deferred quiescent states if preemption enabled. */
- if (!(preempt_count() & PREEMPT_MASK)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
set_tsk_need_resched(current);
@@ -2737,8 +2774,12 @@ static __latent_entropy void rcu_core(void)
/* If there are callbacks ready, invoke them. */
if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
- likely(READ_ONCE(rcu_scheduler_fully_active)))
+ likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ invoke_rcu_core();
+ }
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
@@ -2982,7 +3023,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = NULL;
local_irq_save(flags);
- kasan_record_aux_stack(head);
+ kasan_record_aux_stack_noalloc(head);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
@@ -3547,7 +3588,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
return;
}
- kasan_record_aux_stack(ptr);
+ kasan_record_aux_stack_noalloc(ptr);
success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
if (!success) {
run_page_cache_worker(krcp);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..486fc901bd08 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -157,7 +157,6 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -189,11 +188,6 @@ struct rcu_data {
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
@@ -227,8 +221,11 @@ struct rcu_data {
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
struct task_struct *nocb_cb_kthread;
- struct rcu_data *nocb_next_cb_rdp;
- /* Next rcu_data in wakeup chain. */
+ struct list_head nocb_head_rdp; /*
+ * Head of rcu_data list in wakeup chain,
+ * if rdp_gp.
+ */
+ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -419,8 +416,6 @@ static bool rcu_is_callbacks_kthread(void);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
@@ -447,12 +442,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#define rcu_nocb_lock_irqsave(rdp, flags) \
-do { \
- if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
- local_irq_save(flags); \
- else \
- raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ local_irq_save(flags); \
+ if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ raw_spin_lock(&(rdp)->nocb_lock); \
} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f3947c49eee7..237a79989aba 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -255,7 +255,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->exp_deferred_qs, false);
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -387,6 +387,7 @@ retry_ipi:
continue;
}
if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
put_cpu();
continue;
}
@@ -506,7 +507,10 @@ static void synchronize_rcu_expedited_wait(void)
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ preempt_disable();
+ if (cpu_online(cpu))
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ preempt_enable();
}
}
j = READ_ONCE(jiffies_till_first_fqs);
@@ -655,7 +659,7 @@ static void rcu_exp_handler(void *unused)
rcu_dynticks_curr_cpu_in_eqs()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -677,7 +681,7 @@ static void rcu_exp_handler(void *unused)
if (depth > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -759,7 +763,7 @@ static void sync_sched_exp_online_cleanup(int cpu)
my_cpu = get_cpu();
/* Quiescent state either not needed or already requested, leave. */
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- rdp->cpu_no_qs.b.exp) {
+ READ_ONCE(rdp->cpu_no_qs.b.exp)) {
put_cpu();
return;
}
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 368ef7b9af4f..eeafb546a7a0 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -60,16 +60,22 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
* If the list is invalid, a warning is emitted and all CPUs are offloaded.
*/
+
+static bool rcu_nocb_is_setup;
+
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
}
+ rcu_nocb_is_setup = true;
return 1;
}
-__setup("rcu_nocbs=", rcu_nocb_setup);
+__setup("rcu_nocbs", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
@@ -625,7 +631,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
* and the global grace-period kthread are awakened if needed.
*/
WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ /*
+ * An rcu_data structure is removed from the list after its
+ * CPU is de-offloaded and added to the list before that CPU is
+ * (re-)offloaded. If the following loop happens to be referencing
+ * that rcu_data structure during the time that the corresponding
+ * CPU is de-offloaded and then immediately re-offloaded, this
+ * loop's rdp pointer will be carried to the end of the list by
+ * the resulting pair of list operations. This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
bool needwake_state = false;
if (!nocb_gp_enabled_cb(rdp))
@@ -789,6 +809,18 @@ static void nocb_cb_wait(struct rcu_data *rdp)
bool can_sleep = true;
struct rcu_node *rnp = rdp->mynode;
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+
+
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
@@ -841,17 +873,6 @@ static void nocb_cb_wait(struct rcu_data *rdp)
if (needwake_state)
swake_up_one(&rdp->nocb_state_wq);
-
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
}
/*
@@ -990,22 +1011,33 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* will refuse to put anything into the bypass.
*/
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ /*
+ * Start with invoking rcu_core() early. This way if the current thread
+ * happens to preempt an ongoing call to rcu_core() in the middle,
+ * leaving some work dismissed because rcu_core() still thinks the rdp is
+ * completely offloaded, we are guaranteed a nearby future instance of
+ * rcu_core() to catch up.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
+ invoke_rcu_core();
ret = rdp_offload_toggle(rdp, false, flags);
swait_event_exclusive(rdp->nocb_state_wq,
!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
SEGCBLIST_KTHREAD_GP));
+ /* Stop nocb_gp_wait() from iterating over this structure. */
+ list_del_rcu(&rdp->nocb_entry_rdp);
/*
* Lock one last time to acquire latest callback updates from kthreads
* so we can later handle callbacks locally without locking.
*/
rcu_nocb_lock_irqsave(rdp, flags);
/*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+ * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
* lock is released but how about being paranoid for once?
*/
- rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
/*
- * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * Without SEGCBLIST_LOCKING, we can't use
* rcu_nocb_unlock_irqrestore() anymore.
*/
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
@@ -1057,15 +1089,26 @@ static long rcu_nocb_rdp_offload(void *arg)
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
+
+ /*
+ * Cause future nocb_gp_wait() invocations to iterate over
+ * structure, resetting ->nocb_gp_sleep and waking up the related
+ * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock
+ * before setting ->nocb_gp_sleep again, we are guaranteed to
+ * iterate this newly added structure before "rcuog" goes to
+ * sleep again.
+ */
+ list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
+
/*
- * Can't use rcu_nocb_lock_irqsave() while we are in
- * SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
+ * is set.
*/
raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
/*
* We didn't take the nocb lock while working on the
- * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
* Every modifications that have been done previously on
* rdp->cblist must be visible remotely by the nocb kthreads
* upon wake up after reading the cblist flags.
@@ -1084,6 +1127,14 @@ static long rcu_nocb_rdp_offload(void *arg)
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ /*
+ * All kthreads are ready to work, we can finally relieve rcu_core() and
+ * enable nocb bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
return ret;
}
@@ -1122,13 +1173,17 @@ void __init rcu_init_nohz(void)
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
+ if (need_rcu_nocb_mask) {
+ if (!cpumask_available(rcu_nocb_mask)) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
}
+ rcu_nocb_is_setup = true;
}
- if (!cpumask_available(rcu_nocb_mask))
+
+ if (!rcu_nocb_is_setup)
return;
#if defined(CONFIG_NO_HZ_FULL)
@@ -1154,8 +1209,8 @@ void __init rcu_init_nohz(void)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
}
rcu_organize_nocb_kthreads();
}
@@ -1178,17 +1233,17 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
* rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
* for this CPU's group has not yet been created, spawn it as well.
*/
-static void rcu_spawn_one_nocb_kthread(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_data *rdp_gp;
struct task_struct *t;
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
+ if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
return;
/* If we didn't spawn the GP kthread first, reorganize! */
@@ -1211,16 +1266,6 @@ static void rcu_spawn_one_nocb_kthread(int cpu)
}
/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
* Once the scheduler is running, spawn rcuo kthreads for all online
* no-CBs CPUs. This assumes that the early_initcall()s happen before
* non-boot CPUs come online -- if this changes, we will need to add
@@ -1230,8 +1275,10 @@ static void __init rcu_spawn_nocb_kthreads(void)
{
int cpu;
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
+ if (rcu_nocb_is_setup) {
+ for_each_online_cpu(cpu)
+ rcu_spawn_cpu_nocb_kthread(cpu);
+ }
}
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
@@ -1251,7 +1298,6 @@ static void __init rcu_organize_nocb_kthreads(void)
int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
if (!cpumask_available(rcu_nocb_mask))
return;
@@ -1265,14 +1311,14 @@ static void __init rcu_organize_nocb_kthreads(void)
* Should the corresponding CPU come online in the future, then
* we will spawn the needed set of rcu_nocb_kthread() kthreads.
*/
- for_each_cpu(cpu, rcu_nocb_mask) {
+ for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
/* New GP kthread, set up for CBs & next GP. */
gotnocbs = true;
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
rdp_gp = rdp;
+ INIT_LIST_HEAD(&rdp->nocb_head_rdp);
if (dump_tree) {
if (!firsttime)
pr_cont("%s\n", gotnocbscbs
@@ -1285,12 +1331,12 @@ static void __init rcu_organize_nocb_kthreads(void)
} else {
/* Another CB kthread, link to previous GP kthread. */
gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
if (dump_tree)
pr_cont(" %d", cpu);
}
- rdp_prev = rdp;
+ rdp->nocb_gp_rdp = rdp_gp;
+ if (cpumask_test_cpu(cpu, rcu_nocb_mask))
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
}
if (gotnocbs && dump_tree)
pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
@@ -1352,6 +1398,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
{
char bufw[20];
char bufr[20];
+ struct rcu_data *nocb_next_rdp;
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
bool wassleep;
@@ -1359,11 +1406,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
+ nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
+ &rdp->nocb_entry_rdp,
+ typeof(*rdp),
+ nocb_entry_rdp);
+
sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
- rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
+ nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
"cC"[!!atomic_read(&rdp->nocb_lock_contended)],
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5199559fbbf0..c5b45c2f68a1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -16,7 +16,7 @@
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
- * In order to read the offloaded state of an rdp is a safe
+ * In order to read the offloaded state of an rdp in a safe
* and stable way and prevent from its value to be changed
* under us, we must either hold the barrier mutex, the cpu
* hotplug lock (read or write) or the nocb lock. Local
@@ -51,12 +51,10 @@ static void __init rcu_bootup_announce_oddness(void)
RCU_FANOUT);
if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
- pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -88,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_kick_kthreads)
pr_info("\tKick kthreads if too-long grace period.\n");
if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
- pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
if (gp_preinit_delay)
pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
if (gp_init_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
- pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
@@ -260,10 +258,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->exp_deferred_qs);
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
}
/*
@@ -277,12 +275,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* current task, there might be any number of other tasks blocked while
* in an RCU read-side critical section.
*
+ * Unlike non-preemptible-RCU, quiescent state reports for expedited
+ * grace periods are handled separately via deferred quiescent states
+ * and context switch events.
+ *
* Callers to this function must disable preemption.
*/
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
- if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
@@ -350,7 +352,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -477,7 +479,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->exp_deferred_qs) {
+ if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
}
@@ -497,7 +499,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
/* Clean up if blocked during RCU read-side critical section. */
@@ -580,7 +582,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
rcu_preempt_depth() == 0;
}
@@ -642,7 +644,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
(IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
@@ -845,10 +847,8 @@ static void rcu_qs(void)
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
- return;
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}
/*
@@ -925,7 +925,18 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
-static void rcu_preempt_deferred_qs(struct task_struct *t) { }
+
+// Except that we do need to respond to a request by an expedited grace
+// period for a quiescent state from this CPU. Note that requests from
+// tasks are handled when removing the task from the blocked-tasks list
+// below.
+static void rcu_preempt_deferred_qs(struct task_struct *t)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (rdp->cpu_no_qs.b.exp)
+ rcu_report_exp_rdp(rdp);
+}
/*
* Because there is no preemptible RCU, there can be no readers blocked,
@@ -1153,7 +1164,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
@@ -1204,8 +1214,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU));
if (cpumask_weight(cm) == 0)
- cpumask_setall(cm);
+ cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
@@ -1253,201 +1264,6 @@ static void __init rcu_spawn_boost_kthreads(void)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future non-offloaded RCU-related work will need
- * to be done by the current CPU, even if none need be done immediately,
- * returning 1 if so. This function is part of the RCU implementation;
- * it is -not- an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether or not this
- * CPU has RCU callbacks queued.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- *nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
- !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.
- *
- * The following preprocessor symbol controls this:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- *
- * The value below works well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-
-/*
- * Try to advance callbacks on the current CPU, but only if it has been
- * awhile since the last time we did so. Afterwards, if there are any
- * callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
- bool cbs_ready = false;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
-
- /* Exit early if we advanced recently. */
- if (jiffies == rdp->last_advance_all)
- return false;
- rdp->last_advance_all = jiffies;
-
- rnp = rdp->mynode;
-
- /*
- * Don't bother checking unless a grace period has
- * completed since we last checked and there are
- * callbacks not yet ready to invoke.
- */
- if ((rcu_seq_completed_gp(rdp->gp_seq,
- rcu_seq_current(&rnp->gp_seq)) ||
- unlikely(READ_ONCE(rdp->gpwrap))) &&
- rcu_segcblist_pend_cbs(&rdp->cblist))
- note_gp_changes(rdp);
-
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- cbs_ready = true;
- return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller about what to set the timeout.
- *
- * The caller must have disabled interrupts.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- unsigned long dj;
-
- lockdep_assert_irqs_disabled();
-
- /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist) ||
- rcu_rdp_is_offloaded(rdp)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
-
- /* Attempt to advance callbacks. */
- if (rcu_try_advance_all_cbs()) {
- /* Some ready to invoke, so initiate later invocation. */
- invoke_rcu_core();
- return 1;
- }
- rdp->last_accelerate = jiffies;
-
- /* Request timer and round. */
- dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
-
- *nextevt = basemono + dj * TICK_NSEC;
- return 0;
-}
-
-/*
- * Prepare a CPU for idle from an RCU perspective. The first major task is to
- * sense whether nohz mode has been enabled or disabled via sysfs. The second
- * major task is to accelerate (that is, assign grace-period numbers to) any
- * recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
- bool needwake;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
- int tne;
-
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- return;
-
- /* Handle nohz enablement switches conservatively. */
- tne = READ_ONCE(tick_nohz_active);
- if (tne != rdp->tick_nohz_enabled_snap) {
- if (!rcu_segcblist_empty(&rdp->cblist))
- invoke_rcu_core(); /* force nohz to see update. */
- rdp->tick_nohz_enabled_snap = tne;
- return;
- }
- if (!tne)
- return;
-
- /*
- * If we have not yet accelerated this jiffy, accelerate all
- * callbacks on this CPU.
- */
- if (rdp->last_accelerate == jiffies)
- return;
- rdp->last_accelerate = jiffies;
- if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_accelerate_cbs(rnp, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- if (needwake)
- rcu_gp_kthread_wake();
- }
-}
-
-/*
- * Clean up for exit from idle. Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -1455,7 +1271,7 @@ static void rcu_cleanup_after_idle(void)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPU CPUs.
+ * RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(void)
{
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5e2fa6fd97f1..21bebf7c9030 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void)
}
}
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- !!rdp->tick_nohz_enabled_snap);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
static const char * const gp_state_names[] = {
[RCU_GP_IDLE] = "RCU_GP_IDLE",
[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
@@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
* of RCU grace periods that this CPU is ignorant of, for example, "1"
* if the CPU was aware of the previous grace period.
*
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ * Also print out idle info.
*/
static void print_cpu_stall_info(int cpu)
{
unsigned long delta;
bool falsepositive;
- char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
char *ticks_title;
unsigned long ticks_value;
@@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu)
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
}
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n",
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu)
rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz,
falsepositive ? " (false positive?)" : "");
}
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 5d42f44e3e1a..dcb0410950e4 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -38,14 +38,10 @@
#define SCFTORT_STRING "scftorture"
#define SCFTORT_FLAG SCFTORT_STRING ": "
-#define SCFTORTOUT(s, x...) \
- pr_alert(SCFTORT_FLAG s, ## x)
-
#define VERBOSE_SCFTORTOUT(s, x...) \
- do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0)
+ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0)
-#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0)
+#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
@@ -587,14 +583,14 @@ static int __init scf_torture_init(void)
if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 &&
weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 &&
weight_all1 == 0 && weight_all_wait1 == 0) {
- VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
+ SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
firsterr = -EINVAL;
goto unwind;
}
if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST))
scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false);
else if (weight_resched1)
- VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
+ SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
@@ -625,12 +621,12 @@ static int __init scf_torture_init(void)
nthreads = num_online_cpus();
scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL);
if (!scf_stats_p) {
- VERBOSE_SCFTORTOUT_ERRSTRING("out of memory");
+ SCFTORTOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads);
+ VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads);
atomic_set(&n_started, nthreads);
for (i = 0; i < nthreads; i++) {
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c7421f2d05e1..c83b37af155b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,11 +11,10 @@ ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
-# There are numerous data races here, however, most of them are due to plain accesses.
-# This would make it even harder for syzbot to find reproducers, because these
-# bugs trigger without specific input. Disable by default, but should re-enable
-# eventually.
+# Disable KCSAN to avoid excessive noise and performance degradation. To avoid
+# false positives ensure barriers implied by sched functions are instrumented.
KCSAN_SANITIZE := n
+KCSAN_INSTRUMENT_BARRIERS := y
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a02820fc10b..83872f95a1ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}
/*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5247,6 +5258,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);
rq_unlock(rq, &rf);
@@ -5659,6 +5671,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5711,10 +5724,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}
/*
@@ -5756,7 +5777,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
p = rq_i->core_pick = pick_task(rq_i);
@@ -5786,7 +5812,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (p == rq_i->idle) {
if (rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
@@ -5795,6 +5821,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ if (cookie)
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5831,8 +5863,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
@@ -6036,11 +6068,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;
/* copy the shared state to the new leader */
- core_rq->core_task_seq = rq->core_task_seq;
- core_rq->core_pick_seq = rq->core_pick_seq;
- core_rq->core_cookie = rq->core_cookie;
- core_rq->core_forceidle = rq->core_forceidle;
- core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -7129,7 +7169,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long sched_cpu_util(int cpu, unsigned long max)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -8523,7 +8563,7 @@ void sched_show_task(struct task_struct *p)
rcu_read_unlock();
pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -9412,7 +9452,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 517f72b008f5..1fb45672ec85 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
enqueued = sched_core_enqueued(p);
if (enqueued)
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
@@ -232,3 +236,63 @@ out:
return err;
}
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (!p->core_cookie)
+ continue;
+
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 893eece65bfd..3d06c5e4220d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-struct cpuacct_usage {
- u64 usages[CPUACCT_STAT_NSTATS];
-};
-
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
- struct cpuacct_usage __percpu *cpuusage;
+ u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
@@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
@@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
- ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+ ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
@@ -99,14 +95,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
- BUG_ON(index > CPUACCT_STAT_NSTATS);
+ if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+ return 0;
#ifndef CONFIG_64BIT
/*
@@ -115,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
- if (index == CPUACCT_STAT_NSTATS) {
- int i = 0;
-
- data = 0;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- data += cpuusage->usages[i];
- } else {
- data = cpuusage->usages[index];
+ switch (index) {
+ case CPUACCT_STAT_USER:
+ data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+ break;
+ case CPUACCT_STAT_SYSTEM:
+ data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+ cpustat[CPUTIME_SOFTIRQ];
+ break;
+ case CPUACCT_STAT_NSTATS:
+ data = *cpuusage;
+ break;
}
#ifndef CONFIG_64BIT
@@ -132,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
return data;
}
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- int i;
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ /* Don't allow to reset global kernel_cpustat */
+ if (ca == &root_cpuacct)
+ return;
#ifndef CONFIG_64BIT
/*
@@ -143,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
-
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- cpuusage->usages[i] = val;
+ *cpuusage = 0;
+ cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+ cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+ cpustat[CPUTIME_SOFTIRQ] = 0;
#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));
@@ -196,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;
for_each_possible_cpu(cpu)
- cpuacct_cpuusage_write(ca, cpu, 0);
+ cpuacct_cpuusage_write(ca, cpu);
return 0;
}
@@ -243,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
seq_printf(m, "%d", cpu);
-
- for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit
- * platforms.
- */
- raw_spin_rq_lock_irq(cpu_rq(cpu));
-#endif
-
- seq_printf(m, " %llu", cpuusage->usages[index]);
-
-#ifndef CONFIG_64BIT
- raw_spin_rq_unlock_irq(cpu_rq(cpu));
-#endif
- }
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %llu",
+ cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;
@@ -270,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
- s64 val[CPUACCT_STAT_NSTATS];
+ struct task_cputime cputime;
+ u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;
- memset(val, 0, sizeof(val));
+ memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+ cputime.utime += cpustat[CPUTIME_USER];
+ cputime.utime += cpustat[CPUTIME_NICE];
+ cputime.stime += cpustat[CPUTIME_SYSTEM];
+ cputime.stime += cpustat[CPUTIME_IRQ];
+ cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+ cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
+ cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+ &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
- seq_printf(sf, "%s %lld\n",
- cpuacct_stat_desc[stat],
- (long long)nsec_to_clock_t(val[stat]));
+ seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
+ nsec_to_clock_t(val[stat]));
}
return 0;
@@ -339,16 +335,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
-
- if (regs && user_mode(regs))
- index = CPUACCT_STAT_USER;
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(ca->cpuusage->usages[index], cputime);
+ __this_cpu_add(*ca->cpuusage, cputime);
rcu_read_unlock();
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e7af18857371..26778884d9ab 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9392aea1804e..b7ec42732b28 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7dcbaa31c5d9..aa29211de1bf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}
__P(nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e476f6d9435..095b0aa378df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1502,7 +1502,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);
@@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
- ns->util += cpu_util(cpu);
+ ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@@ -3240,7 +3239,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
- * See cpu_util().
+ * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
@@ -4070,7 +4069,8 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline int task_fits_capacity(struct task_struct *p,
+ unsigned long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}
@@ -5509,11 +5509,9 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+ return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -6345,7 +6343,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(int task_util, int cpu)
+static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
return fits_capacity(task_util, capacity_of(cpu));
@@ -6398,8 +6396,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
+ in_task() &&
prev == smp_processor_id() &&
- this_rq()->nr_running <= 1) {
+ this_rq()->nr_running <= 1 &&
+ asym_fits_capacity(task_util, prev)) {
return prev;
}
@@ -6456,58 +6456,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/**
- * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
-{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
-
- return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -6528,7 +6476,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util(cpu);
+ return cpu_util_cfs(cpu);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
@@ -6592,7 +6540,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
- * the cpu_util call.
+ * cpu_util.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
@@ -6624,7 +6572,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
+ * cpu_util after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
@@ -6915,6 +6863,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
@@ -8681,7 +8634,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct rq *rq = cpu_rq(i);
sgs->group_load += cpu_load(rq);
- sgs->group_util += cpu_util(i);
+ sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -9699,7 +9652,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util(cpu_of(rq));
+ util = cpu_util_cfs(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -11068,7 +11021,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..a679613a7cb7 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
@@ -34,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
* SOME = nr_delayed_tasks != 0
- * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
- * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
- * FULL = (256 - min(257, 256)) / 256 = 0%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@@ -112,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+ return unlikely(tasks[NR_MEMSTALL] &&
+ tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], clear, set);
+ groupc->tasks[3], groupc->tasks[4],
+ clear, set);
psi_bug = 1;
}
}
@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
- * runtime state, the cgroup that contains both tasks
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
*/
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;
/*
- * When we're going to sleep, psi_dequeue() lets us handle
- * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
- * with TSK_ONCPU and save walking common ancestors twice.
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b48baaba2fc2..7b4f4fbbb404 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
@@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ do_start_rt_bandwidth(rt_b);
+}
+
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -1031,13 +1036,17 @@ static void update_curr_rt(struct rq *rq)
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}
@@ -2911,8 +2920,12 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0e66749486e7..de53be905739 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1111,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
- unsigned char core_forceidle;
+ unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
#endif
};
@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}
extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
#include "stats.h"
#include "autogroup.h"
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -2938,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
-static inline unsigned long cpu_util_cfs(struct rq *rq)
+/**
+ * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for.
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Estimated) utilization for the specified CPU.
+ */
+static inline unsigned long cpu_util_cfs(int cpu)
{
- unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+ struct cfs_rq *cfs_rq;
+ unsigned long util;
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
- READ_ONCE(rq->cfs.avg.util_est.enqueued));
+ READ_ONCE(cfs_rq->avg.util_est.enqueued));
}
- return util;
+ return min(util, capacity_orig_of(cpu));
}
static inline unsigned long cpu_util_rt(struct rq *rq)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cfb0893a83d4..3a3c826dd83a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
+
if (!wakeup || p->sched_psi_wake_requeue) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;
if (p->in_memstall)
- clear |= TSK_MEMSTALL;
+ clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
psi_task_change(p, clear, 0);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index a629b11bf3e0..dfcee3888b00 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4185,6 +4185,15 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
ss_mode != 0))
return -EINVAL;
+ /*
+ * Return before taking any locks if no actual
+ * sigaltstack changes were requested.
+ */
+ if (t->sas_ss_sp == (unsigned long)ss_sp &&
+ t->sas_ss_size == ss_size &&
+ t->sas_ss_flags == ss_flags)
+ return 0;
+
sigaltstack_lock();
if (ss_mode == SS_DISABLE) {
ss_size = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..d7ed1dffa426 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -33,6 +33,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/kmemleak.h>
+#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b348749a9fc6..dcdcb85121e4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1306,8 +1306,7 @@ int do_settimeofday64(const struct timespec64 *ts)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ ts_delta = timespec64_sub(*ts, xt);
if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
ret = -EINVAL;
diff --git a/kernel/torture.c b/kernel/torture.c
index bb8f411c974b..ef27a6c82451 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -570,7 +570,7 @@ int torture_shuffle_init(long shuffint)
shuffle_idle_cpu = -1;
if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
- VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ TOROUT_ERRSTRING("Failed to alloc mask");
return -ENOMEM;
}
@@ -934,7 +934,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
*tp = kthread_run(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
- VERBOSE_TOROUT_ERRSTRING(f);
+ TOROUT_ERRSTRING(f);
*tp = NULL;
}
torture_shuffle_task_register(*tp);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ae9755037b7e..21aa30644219 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
};
@@ -450,9 +450,9 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
.func = bpf_trace_vprintk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
- .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -492,9 +492,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -509,7 +509,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -533,7 +533,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -694,7 +694,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -764,7 +764,7 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = {
.func = bpf_get_current_task_btf,
.gpl_only = true,
.ret_type = RET_PTR_TO_BTF_ID,
- .ret_btf_id = &btf_task_struct_ids[0],
+ .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
@@ -779,7 +779,7 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.func = bpf_task_pt_regs,
.gpl_only = true,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &btf_task_struct_ids[0],
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.ret_type = RET_PTR_TO_BTF_ID,
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
@@ -1004,7 +1004,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
@@ -1012,7 +1012,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
{
/* This helper call is inlined by verifier. */
- return ((u64 *)ctx)[-1];
+ return ((u64 *)ctx)[-2];
}
static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
@@ -1091,6 +1091,53 @@ static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
};
+BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ if ((u64) n >= nr_args)
+ return -EINVAL;
+ *value = ((u64 *)ctx)[n];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_proto = {
+ .func = get_func_arg,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ *value = ((u64 *)ctx)[nr_args];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ret_proto = {
+ .func = get_func_ret,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_1(get_func_arg_cnt, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
+ .func = get_func_arg_cnt,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1206,6 +1253,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_func_ip_proto_tracing;
case BPF_FUNC_get_branch_snapshot:
return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
default:
@@ -1285,7 +1334,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1400,9 +1449,6 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
struct perf_branch_stack *br_stack = ctx->data->br_stack;
u32 to_copy;
@@ -1411,7 +1457,7 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
return -EINVAL;
if (unlikely(!br_stack))
- return -EINVAL;
+ return -ENOENT;
if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
return br_stack->nr * br_entry_size;
@@ -1423,7 +1469,6 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
memcpy(buf, br_stack->entries, to_copy);
return to_copy;
-#endif
}
static const struct bpf_func_proto bpf_read_branch_records_proto = {
@@ -1511,7 +1556,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1565,7 +1610,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1631,6 +1676,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
NULL;
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
+ case BPF_FUNC_get_func_arg:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ case BPF_FUNC_get_func_ret:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
+ case BPF_FUNC_get_func_arg_cnt:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 88de94da596b..78ea542ce3bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3207,7 +3207,7 @@ struct trace_buffer_struct {
char buffer[4][TRACE_BUF_SIZE];
};
-static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
/*
* This allows for lockless recording. If we're nested too deeply, then
@@ -3217,7 +3217,7 @@ static char *get_trace_buf(void)
{
struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!buffer || buffer->nesting >= 4)
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
return NULL;
buffer->nesting++;
@@ -3236,7 +3236,7 @@ static void put_trace_buf(void)
static int alloc_percpu_trace_buffer(void)
{
- struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct __percpu *buffers;
if (trace_percpu_buffer)
return 0;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 33272a7b6912..4e1257f50aa3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_kprobe: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f5f0039d31e5..4f35514a48f3 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_uprobe: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/module.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 4f5613dac227..7b32c356ebc5 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -264,15 +264,16 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
{
struct ucounts *iter;
+ long max = LONG_MAX;
long ret = 0;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long max = READ_ONCE(iter->ns->ucount_max[type]);
long new = atomic_long_add_return(v, &iter->ucount[type]);
if (new < 0 || new > max)
ret = LONG_MAX;
else if (iter == ucounts)
ret = new;
+ max = READ_ONCE(iter->ns->ucount_max[type]);
}
return ret;
}
@@ -312,15 +313,16 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
{
/* Caller must hold a reference to ucounts */
struct ucounts *iter;
+ long max = LONG_MAX;
long dec, ret = 0;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long max = READ_ONCE(iter->ns->ucount_max[type]);
long new = atomic_long_add_return(1, &iter->ucount[type]);
if (new < 0 || new > max)
goto unwind;
if (iter == ucounts)
ret = new;
+ max = READ_ONCE(iter->ns->ucount_max[type]);
/*
* Grab an extra ucount reference for the caller when
* the rlimit count was previously 0.
@@ -339,15 +341,16 @@ unwind:
return 0;
}
-bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit)
{
struct ucounts *iter;
- if (get_ucounts_value(ucounts, type) > max)
- return true;
+ long max = rlimit;
+ if (rlimit > LONG_MAX)
+ max = LONG_MAX;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- max = READ_ONCE(iter->ns->ucount_max[type]);
if (get_ucounts_value(iter, type) > max)
return true;
+ max = READ_ONCE(iter->ns->ucount_max[type]);
}
return false;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 613917bbc4e7..33f1106b4f99 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -154,6 +154,9 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
+ /* The current concurrency level. */
+ atomic_t nr_running;
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -178,18 +181,11 @@ struct worker_pool {
int refcnt; /* PL: refcnt for unbound pools */
/*
- * The current concurrency level. As it's likely to be accessed
- * from other CPUs during try_to_wake_up(), put it in a separate
- * cacheline.
- */
- atomic_t nr_running ____cacheline_aligned_in_smp;
-
- /*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
-} ____cacheline_aligned_in_smp;
+};
/*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
@@ -868,8 +864,17 @@ void wq_worker_running(struct task_struct *task)
if (!worker->sleeping)
return;
+
+ /*
+ * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+ * and the nr_running increment below, we may ruin the nr_running reset
+ * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+ * pool. Protect against such race.
+ */
+ preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
+ preempt_enable();
worker->sleeping = 0;
}
@@ -878,8 +883,7 @@ void wq_worker_running(struct task_struct *task)
* @task: task going to sleep
*
* This function is called from schedule() when a busy worker is
- * going to sleep. Preemption needs to be disabled to protect ->sleeping
- * assignment.
+ * going to sleep.
*/
void wq_worker_sleeping(struct task_struct *task)
{
@@ -904,6 +908,16 @@ void wq_worker_sleeping(struct task_struct *task)
raw_spin_lock_irq(&pool->lock);
/*
+ * Recheck in case unbind_workers() preempted us. We don't
+ * want to decrement nr_running after the worker is unbound
+ * and nr_running has been reset.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING) {
+ raw_spin_unlock_irq(&pool->lock);
+ return;
+ }
+
+ /*
* The counterpart of the following dec_and_test, implied mb,
* worklist not empty test sequence is in insert_work().
* Please read comment there.
@@ -1531,7 +1545,8 @@ out:
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
+ * can't go away. Callers that fail to ensure that the specified
+ * CPU cannot go away will execute on a randomly chosen CPU.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
@@ -1811,14 +1826,8 @@ static void worker_enter_idle(struct worker *worker)
if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
- /*
- * Sanity check nr_running. Because unbind_workers() releases
- * pool->lock between setting %WORKER_UNBOUND and zapping
- * nr_running, the warning may trigger spuriously. Check iff
- * unbind is not in progress.
- */
- WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
- pool->nr_workers == pool->nr_idle &&
+ /* Sanity check nr_running. */
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
atomic_read(&pool->nr_running));
}
@@ -4979,38 +4988,22 @@ static void unbind_workers(int cpu)
/*
* We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
- * except for the ones which are still executing works from
- * before the last CPU down must be on the cpu. After
- * this, they may become diasporas.
+ * must be on the cpu. After this, they may become diasporas.
+ * And the preemption disabled section in their sched callbacks
+ * are guaranteed to see WORKER_UNBOUND since the code here
+ * is on the same cpu.
*/
for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
- raw_spin_unlock_irq(&pool->lock);
-
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, -1);
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
- }
-
- mutex_unlock(&wq_pool_attach_mutex);
-
/*
- * Call schedule() so that we cross rq->lock and thus can
- * guarantee sched callbacks see the %WORKER_UNBOUND flag.
- * This is necessary as scheduler callbacks may be invoked
- * from other cpus.
- */
- schedule();
-
- /*
- * Sched callbacks are disabled now. Zap nr_running.
- * After this, nr_running stays zero and need_more_worker()
- * and keep_working() are always true as long as the
- * worklist is not empty. This pool now behaves as an
- * unbound (in terms of concurrency management) pool which
+ * The handling of nr_running in sched callbacks are disabled
+ * now. Zap nr_running. After this, nr_running stays zero and
+ * need_more_worker() and keep_working() are always true as
+ * long as the worklist is not empty. This pool now behaves as
+ * an unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
atomic_set(&pool->nr_running, 0);
@@ -5020,9 +5013,16 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool);
+
raw_spin_unlock_irq(&pool->lock);
+
+ for_each_pool_worker(worker, pool) {
+ kthread_set_per_cpu(worker->task, -1);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+ }
+
+ mutex_unlock(&wq_pool_attach_mutex);
}
}
@@ -5059,17 +5059,6 @@ static void rebind_workers(struct worker_pool *pool)
unsigned int worker_flags = worker->flags;
/*
- * A bound idle worker should actually be on the runqueue
- * of the associated CPU for local wake-ups targeting it to
- * work. Kick all idle workers so that they migrate to the
- * associated CPU. Doing this in the same loop as
- * replacing UNBOUND with REBOUND is safe as no worker will
- * be bound before @pool->lock is released.
- */
- if (worker_flags & WORKER_IDLE)
- wake_up_process(worker->task);
-
- /*
* We want to clear UNBOUND but can't directly call
* worker_clr_flags() or adjust nr_running. Atomically
* replace UNBOUND with another NOT_RUNNING flag REBOUND.