author    Christian Brauner <brauner@kernel.org>  2024-11-26 18:15:06 +0100
committer Christian Brauner <brauner@kernel.org>  2024-11-26 18:15:06 +0100
commit    cf87766dd6f9ddcceaa8ee26e3cbd7538e42dd19 (patch)
tree      8531685628a090333db2f874688ac07624b51072 /kernel
parent    fs_parser: update mount_api doc to match function signature (diff)
parent    fs/backing_file: fix wrong argument in callback (diff)
Merge branch 'ovl.fixes'
Bring in an overlayfs fix for v6.13-rc1 that fixes a bug introduced by the overlayfs changes merged for v6.13.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/Makefile | 3
-rw-r--r-- kernel/bpf/arena.c | 38
-rw-r--r-- kernel/bpf/arraymap.c | 26
-rw-r--r-- kernel/bpf/bpf_cgrp_storage.c | 4
-rw-r--r-- kernel/bpf/bpf_inode_storage.c | 4
-rw-r--r-- kernel/bpf/bpf_local_storage.c | 79
-rw-r--r-- kernel/bpf/bpf_struct_ops.c | 115
-rw-r--r-- kernel/bpf/bpf_task_storage.c | 7
-rw-r--r-- kernel/bpf/btf.c | 57
-rw-r--r-- kernel/bpf/cgroup.c | 2
-rw-r--r-- kernel/bpf/core.c | 6
-rw-r--r-- kernel/bpf/dispatcher.c | 3
-rw-r--r-- kernel/bpf/hashtab.c | 56
-rw-r--r-- kernel/bpf/helpers.c | 29
-rw-r--r-- kernel/bpf/kmem_cache_iter.c | 238
-rw-r--r-- kernel/bpf/memalloc.c | 5
-rw-r--r-- kernel/bpf/range_tree.c | 272
-rw-r--r-- kernel/bpf/range_tree.h | 21
-rw-r--r-- kernel/bpf/syscall.c | 191
-rw-r--r-- kernel/bpf/trampoline.c | 60
-rw-r--r-- kernel/bpf/verifier.c | 597
-rw-r--r-- kernel/cgroup/cgroup.c | 21
-rw-r--r-- kernel/cgroup/cpuset.c | 157
-rw-r--r-- kernel/cgroup/freezer.c | 97
-rw-r--r-- kernel/cgroup/rstat.c | 19
-rw-r--r-- kernel/configs/debug.config | 1
-rw-r--r-- kernel/cred.c | 6
-rw-r--r-- kernel/debug/kdb/kdb_bp.c | 6
-rw-r--r-- kernel/debug/kdb/kdb_keyboard.c | 33
-rw-r--r-- kernel/debug/kdb/kdb_main.c | 69
-rw-r--r-- kernel/dma/Kconfig | 17
-rw-r--r-- kernel/dma/coherent.c | 14
-rw-r--r-- kernel/dma/debug.c | 89
-rw-r--r-- kernel/dma/mapping.c | 37
-rw-r--r-- kernel/kprobes.c | 91
-rw-r--r-- kernel/printk/internal.h | 3
-rw-r--r-- kernel/printk/printk.c | 47
-rw-r--r-- kernel/printk/printk_safe.c | 18
-rw-r--r-- kernel/sched/ext.c | 968
-rw-r--r-- kernel/sysctl.c | 1
-rw-r--r-- kernel/trace/Kconfig | 10
-rw-r--r-- kernel/trace/bpf_trace.c | 110
-rw-r--r-- kernel/trace/fgraph.c | 155
-rw-r--r-- kernel/trace/ftrace.c | 118
-rw-r--r-- kernel/trace/ring_buffer.c | 102
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c | 4
-rw-r--r-- kernel/trace/rv/rv.c | 2
-rw-r--r-- kernel/trace/trace.c | 96
-rw-r--r-- kernel/trace/trace.h | 22
-rw-r--r-- kernel/trace/trace_branch.c | 10
-rw-r--r-- kernel/trace/trace_clock.c | 2
-rw-r--r-- kernel/trace/trace_entries.h | 29
-rw-r--r-- kernel/trace/trace_event_perf.c | 6
-rw-r--r-- kernel/trace/trace_events.c | 2
-rw-r--r-- kernel/trace/trace_events_filter.c | 8
-rw-r--r-- kernel/trace/trace_events_hist.c | 11
-rw-r--r-- kernel/trace/trace_events_user.c | 4
-rw-r--r-- kernel/trace/trace_functions.c | 36
-rw-r--r-- kernel/trace/trace_functions_graph.c | 272
-rw-r--r-- kernel/trace/trace_hwlat.c | 4
-rw-r--r-- kernel/trace/trace_kdb.c | 13
-rw-r--r-- kernel/trace/trace_mmiotrace.c | 8
-rw-r--r-- kernel/trace/trace_osnoise.c | 12
-rw-r--r-- kernel/trace/trace_output.c | 5
-rw-r--r-- kernel/trace/trace_preemptirq.c | 26
-rw-r--r-- kernel/trace/trace_sched_switch.c | 2
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 8
-rw-r--r-- kernel/trace/trace_selftest.c | 1
-rw-r--r-- kernel/trace/trace_syscalls.c | 28
-rw-r--r-- kernel/tracepoint.c | 75
-rw-r--r-- kernel/ucount.c | 2
-rw-r--r-- kernel/watchdog.c | 8
-rw-r--r-- kernel/workqueue.c | 22
73 files changed, 3362 insertions, 1358 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 9b9c151b5c82..9762bdddf1de 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
-obj-$(CONFIG_BPF_SYSCALL) += arena.o
+obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
@@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index e52b3ad231b9..945a5680f6a5 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -3,9 +3,11 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
+#include "linux/filter.h"
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include "range_tree.h"
/*
* bpf_arena is a sparsely populated shared memory region between bpf program and
@@ -45,7 +47,7 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
- struct maple_tree mt;
+ struct range_tree rt;
struct list_head vma_list;
struct mutex lock;
};
@@ -98,6 +100,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
u64 vm_range;
int err = -ENOMEM;
+ if (!bpf_jit_supports_arena())
+ return ERR_PTR(-EOPNOTSUPP);
+
if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
/* BPF_F_MMAPABLE must be set */
!(attr->map_flags & BPF_F_MMAPABLE) ||
@@ -132,7 +137,8 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
INIT_LIST_HEAD(&arena->vma_list);
bpf_map_init_from_attr(&arena->map, attr);
- mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
+ range_tree_init(&arena->rt);
+ range_tree_set(&arena->rt, 0, attr->max_entries);
mutex_init(&arena->lock);
return &arena->map;
@@ -183,7 +189,7 @@ static void arena_map_free(struct bpf_map *map)
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
free_vm_area(arena->kern_vm);
- mtree_destroy(&arena->mt);
+ range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
}
@@ -274,20 +280,20 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
/* User space requested to segfault when page is not allocated by bpf prog */
return VM_FAULT_SIGSEGV;
- ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
+ ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
if (ret) {
- mtree_erase(&arena->mt, vmf->pgoff);
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
}
ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
if (ret) {
- mtree_erase(&arena->mt, vmf->pgoff);
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
__free_page(page);
return VM_FAULT_SIGSEGV;
}
@@ -444,12 +450,16 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
guard(mutex)(&arena->lock);
- if (uaddr)
- ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
- MT_ENTRY, GFP_KERNEL);
- else
- ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
- page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
+ if (uaddr) {
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ goto out_free_pages;
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ } else {
+ ret = pgoff = range_tree_find(&arena->rt, page_cnt);
+ if (pgoff >= 0)
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ }
if (ret)
goto out_free_pages;
@@ -476,7 +486,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
kvfree(pages);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
- mtree_erase(&arena->mt, pgoff);
+ range_tree_set(&arena->rt, pgoff, page_cnt);
out_free_pages:
kvfree(pages);
return 0;
@@ -516,7 +526,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
pgoff = compute_pgoff(arena, uaddr);
/* clear range */
- mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
+ range_tree_set(&arena->rt, pgoff, page_cnt);
if (page_cnt > 1)
/* bulk zap if multiple pages being freed */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 79660e3fca4c..6cdbb4c33d31 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -947,22 +947,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
struct bpf_prog *prog = bpf_prog_get(fd);
+ bool is_extended;
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_map_compatible(map, prog)) {
+ if (prog->type == BPF_PROG_TYPE_EXT ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+ mutex_lock(&prog->aux->ext_mutex);
+ is_extended = prog->aux->is_extended;
+ if (!is_extended)
+ prog->aux->prog_array_member_cnt++;
+ mutex_unlock(&prog->aux->ext_mutex);
+ if (is_extended) {
+ /* Extended prog can not be tail callee. It's to prevent a
+ * potential infinite loop like:
+ * tail callee prog entry -> tail callee prog subprog ->
+ * freplace prog entry --tailcall-> tail callee prog entry.
+ */
+ bpf_prog_put(prog);
+ return ERR_PTR(-EBUSY);
+ }
+
return prog;
}
static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ struct bpf_prog *prog = ptr;
+
+ mutex_lock(&prog->aux->ext_mutex);
+ prog->aux->prog_array_member_cnt--;
+ mutex_unlock(&prog->aux->ext_mutex);
/* bpf_prog is freed after one RCU or tasks trace grace period */
- bpf_prog_put(ptr);
+ bpf_prog_put(prog);
}
static u32 prog_fd_array_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 28efd0a3f220..20f05de92e9c 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
bpf_cgrp_storage_lock();
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, GFP_ATOMIC);
+ value, map_flags, false, GFP_ATOMIC);
bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
@@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, gfp_flags);
+ value, BPF_NOEXIST, false, gfp_flags);
unlock:
bpf_cgrp_storage_unlock();
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e16e79f8cd6d..a51c82dee1bd 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -99,7 +99,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, GFP_ATOMIC);
+ value, map_flags, false, GFP_ATOMIC);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -153,7 +153,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index c938dea5ddbf..7e6a0af0afc1 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem, gfp_t gfp_flags)
+ void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
}
if (selem) {
- if (value)
+ if (value) {
+ /* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
- /* No need to call check_and_init_map_value as memory is zero init */
+ if (swap_uptrs)
+ bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
+ }
return selem;
}
@@ -209,8 +212,12 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ /* The bpf_local_storage_map_free will wait for rcu_barrier */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
bpf_mem_cache_raw_free(selem);
}
@@ -226,16 +233,25 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
bool reuse_now)
{
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-
if (!smap->bpf_ma) {
+ /* Only task storage has uptrs and task storage
+ * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
+ * for task storage, so this bpf_obj_free_fields() won't unpin
+ * any uptr.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
- if (!reuse_now) {
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
- } else {
+ if (reuse_now) {
+ /* reuse_now == true only happens when the storage owner
+ * (e.g. task_struct) is being destructed or the map itself
+ * is being destructed (ie map_free). In both cases,
+ * no bpf prog can have a hold on the selem. It is
+ * safe to unpin the uptrs and free the selem now.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
/* Instead of using the vanilla call_rcu(),
* bpf_mem_cache_free will be able to reuse selem
* immediately.
@@ -243,6 +259,26 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
migrate_enable();
+ return;
+ }
+
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ struct hlist_node *n;
+
+ /* The "_safe" iteration is needed.
+ * The loop is not removing the selem from the list
+ * but bpf_selem_free will use the selem->rcu_head
+ * which is union-ized with the selem->free_node.
+ */
+ hlist_for_each_entry_safe(selem, n, list, free_node) {
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ bpf_selem_free(selem, smap, reuse_now);
}
}
@@ -252,7 +288,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool reuse_now)
+ bool uncharge_mem, struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -296,7 +332,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- bpf_selem_free(selem, smap, reuse_now);
+ hlist_add_head(&selem->free_node, free_selem_list);
if (rcu_access_pointer(local_storage->smap) == smap)
RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -345,6 +381,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
bool bpf_ma, free_local_storage = false;
+ HLIST_HEAD(selem_free_list);
unsigned long flags;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -360,9 +397,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, reuse_now);
+ local_storage, selem, true, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&selem_free_list, reuse_now);
+
if (free_local_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
@@ -524,11 +563,12 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
+ HLIST_HEAD(old_selem_free_list);
unsigned long flags;
int err;
@@ -550,7 +590,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
@@ -584,7 +624,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -624,11 +664,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- true, false);
+ true, &old_selem_free_list);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
bpf_selem_free(alloc_selem, smap, true);
@@ -706,6 +747,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
bool bpf_ma, free_storage = false;
+ HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
@@ -734,10 +776,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, true);
+ local_storage, selem, true, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&free_selem_list, true);
+
if (free_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
@@ -883,6 +927,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
synchronize_rcu();
if (smap->bpf_ma) {
+ rcu_barrier_tasks_trace();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
bpf_mem_alloc_destroy(&smap->selem_ma);
bpf_mem_alloc_destroy(&smap->storage_ma);
}
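The hunks above change bpf_selem_unlink_storage_nolock() to collect victims on a free list instead of freeing them under local_storage->lock, and drain that list with bpf_selem_free_list() only after the raw spinlock is released, so the per-element cleanup (bpf_obj_free_fields(), uptr unpinning) never runs with the lock held. A minimal user-space sketch of that pattern, assuming plain pthreads and illustrative names rather than the kernel API:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct elem {
	struct elem *next;
	int id;
};

static pthread_mutex_t storage_lock = PTHREAD_MUTEX_INITIALIZER;
static struct elem *storage;		/* elements owned by the storage */

static void heavy_cleanup(struct elem *e)
{
	/* stands in for bpf_obj_free_fields()/uptr unpinning */
	printf("freeing elem %d outside the lock\n", e->id);
	free(e);
}

static void destroy_storage(void)
{
	struct elem *free_list = NULL, *e, *next;

	pthread_mutex_lock(&storage_lock);
	/* cheap part under the lock: just unlink onto a private list */
	while ((e = storage)) {
		storage = e->next;
		e->next = free_list;
		free_list = e;
	}
	pthread_mutex_unlock(&storage_lock);

	/* heavier per-element cleanup runs with no lock held */
	for (e = free_list; e; e = next) {
		next = e->next;
		heavy_cleanup(e);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct elem *e = malloc(sizeof(*e));

		e->id = i;
		e->next = storage;
		storage = e;
	}
	destroy_storage();
	return 0;
}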
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index fda3dd2ee984..606efe32485a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -23,7 +23,6 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
- struct rcu_head rcu;
const struct bpf_struct_ops_desc *st_ops_desc;
/* protect map_update */
struct mutex lock;
@@ -32,7 +31,9 @@ struct bpf_struct_ops_map {
* (in kvalue.data).
*/
struct bpf_link **links;
- u32 links_cnt;
+ /* ksyms for bpf trampolines */
+ struct bpf_ksym **ksyms;
+ u32 funcs_cnt;
u32 image_pages_cnt;
/* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
@@ -481,11 +482,11 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
u32 i;
- for (i = 0; i < st_map->links_cnt; i++) {
- if (st_map->links[i]) {
- bpf_link_put(st_map->links[i]);
- st_map->links[i] = NULL;
- }
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
@@ -586,6 +587,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
return 0;
}
+static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
+ void *image, unsigned int size,
+ struct bpf_ksym *ksym)
+{
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_add(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_del(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ kfree(st_map->ksyms[i]);
+ st_map->ksyms[i] = NULL;
+ }
+}
+
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@@ -601,6 +645,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
int prog_fd, err;
u32 i, trampoline_start, image_off = 0;
void *cur_image = NULL, *image = NULL;
+ struct bpf_link **plink;
+ struct bpf_ksym **pksym;
+ const char *tname, *mname;
if (flags)
return -EINVAL;
@@ -639,14 +686,19 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
+ plink = st_map->links;
+ pksym = st_map->ksyms;
+ tname = btf_name_by_offset(st_map->btf, t->name_off);
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
struct bpf_tramp_link *link;
+ struct bpf_ksym *ksym;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
+ mname = btf_name_by_offset(st_map->btf, member->name_off);
ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -714,7 +766,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
&bpf_struct_ops_link_lops, prog);
- st_map->links[i] = &link->link;
+ *plink++ = &link->link;
+
+ ksym = kzalloc(sizeof(*ksym), GFP_USER);
+ if (!ksym) {
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ *pksym++ = ksym;
trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
@@ -735,6 +794,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
+
+ /* init ksym for this trampoline */
+ bpf_struct_ops_ksym_init(tname, mname,
+ image + trampoline_start,
+ image_off - trampoline_start,
+ ksym);
}
if (st_ops->validate) {
@@ -783,6 +848,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
reset_unlock:
+ bpf_struct_ops_map_free_ksyms(st_map);
bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
@@ -790,6 +856,8 @@ reset_unlock:
unlock:
kfree(tlinks);
mutex_unlock(&st_map->lock);
+ if (!err)
+ bpf_struct_ops_map_add_ksyms(st_map);
return err;
}
@@ -849,7 +917,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
+ if (st_map->ksyms)
+ bpf_struct_ops_map_free_ksyms(st_map);
bpf_map_area_free(st_map->links);
+ bpf_map_area_free(st_map->ksyms);
bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
@@ -866,6 +937,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
if (btf_is_module(st_map->btf))
module_put(st_map->st_ops_desc->st_ops->owner);
+ bpf_struct_ops_map_del_ksyms(st_map);
+
/* The struct_ops's function may switch to another struct_ops.
*
* For example, bpf_tcp_cc_x->init() may switch to
@@ -895,6 +968,19 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
return 0;
}
+static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
+{
+ int i;
+ u32 count;
+ const struct btf_member *member;
+
+ count = 0;
+ for_each_member(i, t, member)
+ if (btf_type_resolve_func_ptr(btf, member->type, NULL))
+ count++;
+ return count;
+}
+
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops_desc *st_ops_desc;
@@ -961,11 +1047,15 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
- st_map->links_cnt = btf_type_vlen(t);
+ st_map->funcs_cnt = count_func_ptrs(btf, t);
st_map->links =
- bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *),
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
+ NUMA_NO_NODE);
+
+ st_map->ksyms =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
NUMA_NO_NODE);
- if (!st_map->uvalue || !st_map->links) {
+ if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
ret = -ENOMEM;
goto errout_free;
}
@@ -994,7 +1084,8 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
usage = sizeof(*st_map) +
vt->size - sizeof(struct bpf_struct_ops_value);
usage += vt->size;
- usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
usage += PAGE_SIZE;
return usage;
}
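bpf_struct_ops_ksym_init() above names each per-member trampoline "bpf__<struct_ops type>_<member>", so the trampolines become visible in kallsyms and stack traces. A tiny stand-alone sketch of the resulting symbol string, with hypothetical type and member names and KSYM_NAME_LEN assumed to match the kernel constant:

#include <stdio.h>

#define KSYM_NAME_LEN 512	/* assumption: mirrors the kernel constant */

int main(void)
{
	char name[KSYM_NAME_LEN];
	const char *tname = "tcp_congestion_ops";	/* hypothetical struct_ops type */
	const char *mname = "ssthresh";			/* hypothetical member */

	snprintf(name, sizeof(name), "bpf__%s_%s", tname, mname);
	printf("%s\n", name);	/* prints: bpf__tcp_congestion_ops_ssthresh */
	return 0;
}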
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1eb9852a9f8e..bf7fa15fdcc6 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -128,6 +128,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
struct pid *pid;
int fd, err;
+ if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
+ return -EOPNOTSUPP;
+
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
@@ -146,7 +149,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- GFP_ATOMIC);
+ true, GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -218,7 +221,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? NULL : sdata->data;
}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5cd1c7a23848..e7a59e6462a9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2808,7 +2808,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "type_id=%u", t->type);
}
-static struct btf_kind_operations modifier_ops = {
+static const struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
@@ -2817,7 +2817,7 @@ static struct btf_kind_operations modifier_ops = {
.show = btf_modifier_show,
};
-static struct btf_kind_operations ptr_ops = {
+static const struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
@@ -2858,7 +2858,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
}
-static struct btf_kind_operations fwd_ops = {
+static const struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
@@ -3109,7 +3109,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t,
__btf_array_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations array_ops = {
+static const struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
@@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
- u32 off, int sz, struct btf_field_info *info)
+ u32 off, int sz, struct btf_field_info *info, u32 field_mask)
{
enum btf_field_type type;
u32 res_id;
@@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
type = BPF_KPTR_REF;
else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_PERCPU;
+ else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_UPTR;
else
return -EINVAL;
+ if (!(type & field_mask))
+ return BTF_FIELD_IGNORE;
+
/* Get the base type */
t = btf_type_skip_modifiers(btf, t->type, &res_id);
/* Only pointer to struct is allowed */
@@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
- if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {
+ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
type = BPF_KPTR_REF;
goto end;
}
@@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
break;
@@ -3667,8 +3673,9 @@ static int btf_find_field_one(const struct btf *btf,
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
ret = btf_find_kptr(btf, var_type, off, sz,
- info_cnt ? &info[0] : &tmp);
+ info_cnt ? &info[0] : &tmp, field_mask);
if (ret < 0)
return ret;
break;
@@ -3991,6 +3998,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
@@ -4050,12 +4058,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* Hence we only need to ensure that bpf_{list_head,rb_root} ownership
* does not form cycles.
*/
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
+ const struct btf_type *t;
u32 btf_id;
+ if (rec->fields[i].type == BPF_UPTR) {
+ /* The uptr only supports pinning one page and cannot
+ * point to a kernel struct
+ */
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ return -EINVAL;
+ t = btf_type_by_id(rec->fields[i].kptr.btf,
+ rec->fields[i].kptr.btf_id);
+ if (!t->size)
+ return -EINVAL;
+ if (t->size > PAGE_SIZE)
+ return -E2BIG;
+ continue;
+ }
+
if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
continue;
btf_id = rec->fields[i].graph_root.value_btf_id;
@@ -4191,7 +4215,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations struct_ops = {
+static const struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
@@ -4359,7 +4383,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
-static struct btf_kind_operations enum_ops = {
+static const struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4462,7 +4486,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
-static struct btf_kind_operations enum64_ops = {
+static const struct btf_kind_operations enum64_ops = {
.check_meta = btf_enum64_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4540,7 +4564,7 @@ done:
btf_verifier_log(env, ")");
}
-static struct btf_kind_operations func_proto_ops = {
+static const struct btf_kind_operations func_proto_ops = {
.check_meta = btf_func_proto_check_meta,
.resolve = btf_df_resolve,
/*
@@ -4598,7 +4622,7 @@ static int btf_func_resolve(struct btf_verifier_env *env,
return 0;
}
-static struct btf_kind_operations func_ops = {
+static const struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
.resolve = btf_func_resolve,
.check_member = btf_df_check_member,
@@ -5566,7 +5590,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
goto free_aof;
}
- ret = btf_find_kptr(btf, t, 0, 0, &tmp);
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
if (ret != BTF_FIELD_FOUND)
continue;
@@ -6564,7 +6588,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
- if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ /* Raw tracepoint arguments always get marked as maybe NULL */
+ if (bpf_prog_is_raw_tp(prog))
+ info->reg_type |= PTR_MAYBE_NULL;
+ else if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
info->reg_type |= PTR_MAYBE_NULL;
if (tgt_prog) {
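Besides the raw-tracepoint nullability change just above, the btf.c hunks teach btf_find_kptr() a new "uptr" type tag (BPF_UPTR), and btf_check_and_fixup_fields() enforces that a uptr points to a non-kernel struct of nonzero size no larger than a page. A sketch of how a task-storage map value might declare such a field; only the "uptr" tag string comes from the diff, the __uptr macro spelling and names are assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* The tag string "uptr" is what btf_find_kptr() matches above. */
#define __uptr __attribute__((btf_type_tag("uptr")))

struct user_blob {			/* nonzero size, must fit in one page */
	int counter;
};

struct value_type {
	struct user_blob __uptr *blob;	/* user page pinned by bpf_obj_pin_uptrs() */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE); /* only map type map_check_btf() accepts for BPF_UPTR */
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct value_type);
} task_store SEC(".maps");

char LICENSE[] SEC("license") = "GPL";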
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 025d7e2214ae..46e5db65dbc8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1708,7 +1708,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* returned value != 1 during execution. In all other cases 0 is returned.
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
- struct ctl_table *table, int write,
+ const struct ctl_table *table, int write,
char **buf, size_t *pcount, loff_t *ppos,
enum cgroup_bpf_attach_type atype)
{
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e303626bdb2f..a2327c4fdc8b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -131,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
return fp;
@@ -3044,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 70fb82bf1637..b77db7413f8c 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
d->image = NULL;
goto out;
}
- bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_add(&d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b14b87463ee0..3ec941a0ea41 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -896,9 +896,12 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
+
+ migrate_disable();
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
bpf_mem_cache_free(&htab->ma, l);
+ migrate_enable();
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
if (htab_is_prealloc(htab)) {
bpf_map_dec_elem_count(&htab->map);
check_and_free_fields(htab, l);
- __pcpu_freelist_push(&htab->freelist, &l->fnode);
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
dec_elem_count(htab);
htab_elem_free(htab, l);
@@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
- htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
@@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct htab_elem *l_new = NULL, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
+ void *old_map_ptr;
struct bucket *b;
u32 key_size, hash;
int ret;
@@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
+
+ /* l_old has already been stashed in htab->extra_elems, free
+ * its special fields before it is available for reuse. Also
+ * save the old map pointer in htab of maps before unlock
+ * and release it after unlock.
+ */
+ old_map_ptr = NULL;
+ if (htab_is_prealloc(htab)) {
+ if (map->ops->map_fd_put_ptr)
+ old_map_ptr = fd_htab_map_get_ptr(map, l_old);
+ check_and_free_fields(htab, l_old);
+ }
+ }
+ htab_unlock_bucket(htab, b, hash, flags);
+ if (l_old) {
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
- else
- check_and_free_fields(htab, l_old);
}
- ret = 0;
+ return 0;
err:
htab_unlock_bucket(htab, b, hash, flags);
return ret;
@@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
-
- if (l) {
+ if (l)
hlist_nulls_del_rcu(&l->hash_node);
- free_htab_elem(htab, l);
- } else {
+ else
ret = -ENOENT;
- }
htab_unlock_bucket(htab, b, hash, flags);
+
+ if (l)
+ free_htab_elem(htab, l);
return ret;
}
@@ -1853,13 +1871,14 @@ again_nocopy:
* may cause deadlock. See comments in function
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
* after releasing the bucket lock.
+ *
+ * For htab of maps, htab_put_fd_value() in
+ * free_htab_elem() may acquire a spinlock with bucket
+ * lock being held and it violates the lock rule, so
+ * invoke free_htab_elem() after unlock as well.
*/
- if (is_lru_map) {
- l->batch_flink = node_to_free;
- node_to_free = l;
- } else {
- free_htab_elem(htab, l);
- }
+ l->batch_flink = node_to_free;
+ node_to_free = l;
}
dst_key += key_size;
dst_val += value_size;
@@ -1871,7 +1890,10 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- htab_lru_push_free(htab, l);
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
}
next_batch:
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d45ebe8afb4..751c150f9e1c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2522,6 +2522,25 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
}
/**
+ * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
+ * in the pid namespace of the current task. If a task is returned, it must
+ * either be stored in a map, or released with bpf_task_release().
+ * @vpid: The vpid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(vpid);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
@@ -3068,7 +3087,9 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
+BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3086,8 +3107,8 @@ BTF_ID(func, bpf_cgroup_release_dtor)
#endif
BTF_KFUNCS_START(common_btf_ids)
-BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
-BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
@@ -3124,6 +3145,10 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_get_kmem_cache)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
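The helpers.c hunk registers bpf_task_from_vpid() as an acquire kfunc that may return NULL. A hedged sketch of calling it from a tracing BPF program; the attach point and extern declarations are illustrative, and every acquired task is released as the kfunc comment requires:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct task_struct;

extern struct task_struct *bpf_task_from_vpid(int vpid) __ksym;
extern void bpf_task_release(struct task_struct *p) __ksym;

SEC("tp_btf/sched_switch")		/* hypothetical attach point */
int look_up_init(void *ctx)
{
	struct task_struct *p;

	p = bpf_task_from_vpid(1);	/* pid 1 in the current pid namespace */
	if (!p)
		return 0;		/* KF_RET_NULL: the lookup may fail */

	/* ... inspect the task here ... */

	bpf_task_release(p);		/* KF_ACQUIRE: must release or store in a map */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";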
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 000000000000..3ae2158d767f
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+/* open-coded version */
+struct bpf_iter_kmem_cache {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_kmem_cache_kern {
+ struct kmem_cache *pos;
+} __attribute__((aligned(8)));
+
+#define KMEM_CACHE_POS_START ((void *)1L)
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->pos = KMEM_CACHE_POS_START;
+ return 0;
+}
+
+__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *prev = kit->pos;
+ struct kmem_cache *next;
+ bool destroy = false;
+
+ if (!prev)
+ return NULL;
+
+ mutex_lock(&slab_mutex);
+
+ if (list_empty(&slab_caches)) {
+ mutex_unlock(&slab_mutex);
+ return NULL;
+ }
+
+ if (prev == KMEM_CACHE_POS_START)
+ next = list_first_entry(&slab_caches, struct kmem_cache, list);
+ else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev)
+ next = NULL;
+ else
+ next = list_next_entry(prev, list);
+
+ /* boot_caches have negative refcount, don't touch them */
+ if (next && next->refcount > 0)
+ next->refcount++;
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (prev && prev != KMEM_CACHE_POS_START) {
+ if (prev->refcount > 1)
+ prev->refcount--;
+ else if (prev->refcount == 1)
+ destroy = true;
+ }
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(prev);
+
+ kit->pos = next;
+ return next;
+}
+
+__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *s = kit->pos;
+ bool destroy = false;
+
+ if (s == NULL || s == KMEM_CACHE_POS_START)
+ return;
+
+ mutex_lock(&slab_mutex);
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (s->refcount > 1)
+ s->refcount--;
+ else if (s->refcount == 1)
+ destroy = true;
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(s);
+}
+
+__bpf_kfunc_end_defs();
+
+struct bpf_iter__kmem_cache {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kmem_cache *, s);
+};
+
+union kmem_cache_iter_priv {
+ struct bpf_iter_kmem_cache it;
+ struct bpf_iter_kmem_cache_kern kit;
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t cnt = 0;
+ bool found = false;
+ struct kmem_cache *s;
+ union kmem_cache_iter_priv *p = seq->private;
+
+ mutex_lock(&slab_mutex);
+
+ /* Find an entry at the given position in the slab_caches list instead
+ * of keeping a reference (of the last visited entry, if any) out of
+ * slab_mutex. It might miss something if one is deleted in the middle
+ * while it releases the lock. But it should be rare and there's not
+ * much we can do about it.
+ */
+ list_for_each_entry(s, &slab_caches, list) {
+ if (cnt == *pos) {
+ /* Make sure this entry remains in the list by getting
+ * a new reference count. Note that boot_cache entries
+ * have a negative refcount, so don't touch them.
+ */
+ if (s->refcount > 0)
+ s->refcount++;
+ found = true;
+ break;
+ }
+ cnt++;
+ }
+ mutex_unlock(&slab_mutex);
+
+ if (!found)
+ s = NULL;
+
+ p->kit.pos = s;
+ return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ union kmem_cache_iter_priv *p = seq->private;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog && !ctx.s)
+ bpf_iter_run_prog(prog, &ctx);
+
+ bpf_iter_kmem_cache_destroy(&p->it);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ union kmem_cache_iter_priv *p = seq->private;
+
+ ++*pos;
+
+ return bpf_iter_kmem_cache_next(&p->it);
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+ .start = kmem_cache_iter_seq_start,
+ .next = kmem_cache_iter_seq_next,
+ .stop = kmem_cache_iter_seq_stop,
+ .show = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+ .seq_ops = &kmem_cache_iter_seq_ops,
+ .seq_priv_size = sizeof(union kmem_cache_iter_priv),
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+ struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+ .target = "kmem_cache",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__kmem_cache, s),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+ bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+ return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);
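kmem_cache_iter.c exposes both a seq_file-based iterator target and the open-coded kfuncs bpf_iter_kmem_cache_{new,next,destroy}(), all marked KF_SLEEPABLE in helpers.c. A sketch of driving the open-coded form from a sleepable BPF program, assuming the usual extern __ksym declarations:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct kmem_cache;

struct bpf_iter_kmem_cache {
	__u64 __opaque[1];
} __attribute__((aligned(8)));

extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __ksym;
extern struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) __ksym;
extern void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) __ksym;

SEC("syscall")				/* sleepable context; the kfuncs are KF_SLEEPABLE */
int count_kmem_caches(void *ctx)
{
	struct bpf_iter_kmem_cache it;
	struct kmem_cache *s;
	int cnt = 0;

	if (bpf_iter_kmem_cache_new(&it))
		return 0;

	while ((s = bpf_iter_kmem_cache_next(&it)))
		cnt++;

	bpf_iter_kmem_cache_destroy(&it);
	bpf_printk("kmem_cache count: %d", cnt);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";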
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 146f5b57cfb1..889374722d0a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -254,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
static void free_one(void *obj, bool percpu)
{
- if (percpu) {
+ if (percpu)
free_percpu(((void __percpu **)obj)[1]);
- kfree(obj);
- return;
- }
kfree(obj);
}
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
new file mode 100644
index 000000000000..5bdf9aadca3a
--- /dev/null
+++ b/kernel/bpf/range_tree.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/interval_tree_generic.h>
+#include <linux/slab.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/bpf.h>
+#include "range_tree.h"
+
+/*
+ * struct range_tree is a data structure used to allocate contiguous memory
+ * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is
+ * represented by struct range_node or 'rn' for short.
+ * rn->rn_rbnode links it into an interval tree while
+ * rn->rb_range_size links it into a second rbtree sorted by size of the range.
+ * __find_range() performs binary search and best fit algorithm to find the
+ * range less or equal requested size.
+ * range_tree_clear/set() clears or sets a range of bits in this bitmap. The
+ * adjacent ranges are merged or split at the same time.
+ *
+ * The split/merge logic is based/borrowed from XFS's xbitmap32 added
+ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
+ *
+ * The implementation relies on external lock to protect rbtree-s.
+ * The alloc/free of range_node-s is done via bpf_mem_alloc.
+ *
+ * bpf arena is using range_tree to represent unallocated slots.
+ * At init time:
+ * range_tree_set(rt, 0, max);
+ * Then:
+ * start = range_tree_find(rt, len);
+ * if (start >= 0)
+ * range_tree_clear(rt, start, len);
+ * to find free range and mark slots as allocated and later:
+ * range_tree_set(rt, start, len);
+ * to mark as unallocated after use.
+ */
+struct range_node {
+ struct rb_node rn_rbnode;
+ struct rb_node rb_range_size;
+ u32 rn_start;
+ u32 rn_last; /* inclusive */
+ u32 __rn_subtree_last;
+};
+
+static struct range_node *rb_to_range_node(struct rb_node *rb)
+{
+ return rb_entry(rb, struct range_node, rb_range_size);
+}
+
+static u32 rn_size(struct range_node *rn)
+{
+ return rn->rn_last - rn->rn_start + 1;
+}
+
+/* Find range that fits best to requested size */
+static inline struct range_node *__find_range(struct range_tree *rt, u32 len)
+{
+ struct rb_node *rb = rt->range_size_root.rb_root.rb_node;
+ struct range_node *best = NULL;
+
+ while (rb) {
+ struct range_node *rn = rb_to_range_node(rb);
+
+ if (len <= rn_size(rn)) {
+ best = rn;
+ rb = rb->rb_right;
+ } else {
+ rb = rb->rb_left;
+ }
+ }
+
+ return best;
+}
+
+s64 range_tree_find(struct range_tree *rt, u32 len)
+{
+ struct range_node *rn;
+
+ rn = __find_range(rt, len);
+ if (!rn)
+ return -ENOENT;
+ return rn->rn_start;
+}
+
+/* Insert the range into rbtree sorted by the range size */
+static inline void __range_size_insert(struct range_node *rn,
+ struct rb_root_cached *root)
+{
+ struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
+ u64 size = rn_size(rn);
+ bool leftmost = true;
+
+ while (*link) {
+ rb = *link;
+ if (size > rn_size(rb_to_range_node(rb))) {
+ link = &rb->rb_left;
+ } else {
+ link = &rb->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&rn->rb_range_size, rb, link);
+ rb_insert_color_cached(&rn->rb_range_size, root, leftmost);
+}
+
+#define START(node) ((node)->rn_start)
+#define LAST(node) ((node)->rn_last)
+
+INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32,
+ __rn_subtree_last, START, LAST,
+ static inline __maybe_unused,
+ __range_it)
+
+static inline __maybe_unused void
+range_it_insert(struct range_node *rn, struct range_tree *rt)
+{
+ __range_size_insert(rn, &rt->range_size_root);
+ __range_it_insert(rn, &rt->it_root);
+}
+
+static inline __maybe_unused void
+range_it_remove(struct range_node *rn, struct range_tree *rt)
+{
+ rb_erase_cached(&rn->rb_range_size, &rt->range_size_root);
+ RB_CLEAR_NODE(&rn->rb_range_size);
+ __range_it_remove(rn, &rt->it_root);
+}
+
+static inline __maybe_unused struct range_node *
+range_it_iter_first(struct range_tree *rt, u32 start, u32 last)
+{
+ return __range_it_iter_first(&rt->it_root, start, last);
+}
+
+/* Clear the range in this range tree */
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *new_rn;
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, start, last))) {
+ if (rn->rn_start < start && rn->rn_last > last) {
+ u32 old_last = rn->rn_last;
+
+ /* Overlaps with the entire clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+
+ /* Add a range */
+ migrate_disable();
+ new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
+ migrate_enable();
+ if (!new_rn)
+ return -ENOMEM;
+ new_rn->rn_start = last + 1;
+ new_rn->rn_last = old_last;
+ range_it_insert(new_rn, rt);
+ } else if (rn->rn_start < start) {
+ /* Overlaps with the left side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+ } else if (rn->rn_last > last) {
+ /* Overlaps with the right side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_start = last + 1;
+ range_it_insert(rn, rt);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ range_it_remove(rn, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, rn);
+ migrate_enable();
+ }
+ }
+ return 0;
+}
+
+/* Is the whole range set ? */
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *left;
+
+ /* Is this whole range set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+ return -ESRCH;
+}
+
+/* Set the range in this range tree */
+int range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *right;
+ struct range_node *left;
+ int err;
+
+ /* Is this whole range already set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ err = range_tree_clear(rt, start, len);
+ if (err)
+ return err;
+
+ /* Do we have a left-adjacent range ? */
+ left = range_it_iter_first(rt, start - 1, start - 1);
+ if (left && left->rn_last + 1 != start)
+ return -EFAULT;
+
+ /* Do we have a right-adjacent range ? */
+ right = range_it_iter_first(rt, last + 1, last + 1);
+ if (right && right->rn_start != last + 1)
+ return -EFAULT;
+
+ if (left && right) {
+ /* Combine left and right adjacent ranges */
+ range_it_remove(left, rt);
+ range_it_remove(right, rt);
+ left->rn_last = right->rn_last;
+ range_it_insert(left, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, right);
+ migrate_enable();
+ } else if (left) {
+ /* Combine with the left range */
+ range_it_remove(left, rt);
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ } else if (right) {
+ /* Combine with the right range */
+ range_it_remove(right, rt);
+ right->rn_start = start;
+ range_it_insert(right, rt);
+ } else {
+ migrate_disable();
+ left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
+ migrate_enable();
+ if (!left)
+ return -ENOMEM;
+ left->rn_start = start;
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ }
+ return 0;
+}
+
+void range_tree_destroy(struct range_tree *rt)
+{
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, 0, -1U))) {
+ range_it_remove(rn, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, rn);
+ migrate_enable();
+ }
+}
+
+void range_tree_init(struct range_tree *rt)
+{
+ rt->it_root = RB_ROOT_CACHED;
+ rt->range_size_root = RB_ROOT_CACHED;
+}
diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h
new file mode 100644
index 000000000000..ff0b9110eb71
--- /dev/null
+++ b/kernel/bpf/range_tree.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#ifndef _RANGE_TREE_H
+#define _RANGE_TREE_H 1
+
+struct range_tree {
+ /* root of interval tree */
+ struct rb_root_cached it_root;
+ /* root of rbtree of interval sizes */
+ struct rb_root_cached range_size_root;
+};
+
+void range_tree_init(struct range_tree *rt);
+void range_tree_destroy(struct range_tree *rt);
+
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len);
+int range_tree_set(struct range_tree *rt, u32 start, u32 len);
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len);
+s64 range_tree_find(struct range_tree *rt, u32 len);
+
+#endif
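range_tree.h above is the whole external API: set or clear ranges of bits under a caller-provided lock and find a best-fit free run. A toy user-space model of that contract, using a flat bitmap and first-fit search purely to illustrate how bpf arena drives the API (the kernel version uses the interval and size rbtrees shown in range_tree.c):

#include <stdio.h>
#include <string.h>

#define NPAGES 64
static unsigned char free_map[NPAGES];	/* 1 = free ("set"), 0 = allocated ("cleared") */

static void range_set(unsigned int start, unsigned int len)
{
	memset(free_map + start, 1, len);
}

static void range_clear(unsigned int start, unsigned int len)
{
	memset(free_map + start, 0, len);
}

static long range_find(unsigned int len)	/* first-fit here; the kernel does best-fit */
{
	unsigned int run = 0;

	for (unsigned int i = 0; i < NPAGES; i++) {
		run = free_map[i] ? run + 1 : 0;
		if (run == len)
			return i - len + 1;
	}
	return -1;
}

int main(void)
{
	long start;

	range_set(0, NPAGES);			/* init time: everything unallocated */

	start = range_find(8);			/* arena_alloc_pages(): find a free run ... */
	if (start >= 0) {
		range_clear(start, 8);		/* ... and mark those pages allocated */
		printf("allocated 8 pages at %ld\n", start);
		range_set(start, 8);		/* arena_free_pages(): mark them unallocated again */
	}
	return 0;
}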
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c5aa127ed4cc..5684e8ce132d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <linux/tracepoint.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -155,6 +156,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
+static void unpin_uptr_kaddr(void *kaddr)
+{
+ if (kaddr)
+ unpin_user_page(virt_to_page(kaddr));
+}
+
+static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
+{
+ const struct btf_field *field;
+ void **uptr_addr;
+ int i;
+
+ for (i = 0, field = rec->fields; i < cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ unpin_uptr_kaddr(*uptr_addr);
+ }
+}
+
+static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
+{
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return;
+
+ __bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
+}
+
+static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
+{
+ const struct btf_field *field;
+ const struct btf_type *t;
+ unsigned long start, end;
+ struct page *page;
+ void **uptr_addr;
+ int i, err;
+
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return 0;
+
+ for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ start = *(unsigned long *)uptr_addr;
+ if (!start)
+ continue;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ /* t->size was checked for zero before */
+ if (check_add_overflow(start, t->size - 1, &end)) {
+ err = -EFAULT;
+ goto unpin_all;
+ }
+
+ /* The uptr's struct cannot span across two pages */
+ if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
+ err = -EOPNOTSUPP;
+ goto unpin_all;
+ }
+
+ err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
+ if (err != 1)
+ goto unpin_all;
+
+ if (PageHighMem(page)) {
+ err = -EOPNOTSUPP;
+ unpin_user_page(page);
+ goto unpin_all;
+ }
+
+ *uptr_addr = page_address(page) + offset_in_page(start);
+ }
+
+ return 0;
+
+unpin_all:
+ __bpf_obj_unpin_uptrs(rec, i, obj);
+ return err;
+}
+
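
A small standalone model of the single-page constraint enforced above (PAGE_SIZE is hardcoded for illustration; the kernel additionally rejects address overflow via check_add_overflow()):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Returns 1 if an object of 'size' bytes at user address 'start' fits
 * entirely within one page, mirroring the
 * (start & PAGE_MASK) != (end & PAGE_MASK) rejection above. */
static int fits_in_one_page(unsigned long start, unsigned long size)
{
	unsigned long end = start + size - 1;

	return (start & PAGE_MASK) == (end & PAGE_MASK);
}

int main(void)
{
	printf("%d\n", fits_in_one_page(4096 - 8, 16));	/* 0: crosses a page boundary */
	printf("%d\n", fits_in_one_page(4096, 16));		/* 1: fits in one page */
	return 0;
}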
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
void *key, void *value, __u64 flags)
{
@@ -199,9 +283,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, flags);
- rcu_read_unlock();
+ err = bpf_obj_pin_uptrs(map->record, value);
+ if (!err) {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ if (err)
+ bpf_obj_unpin_uptrs(map->record, value);
+ }
}
bpf_enable_instrumentation();
@@ -548,6 +637,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
if (btf_is_kernel(rec->fields[i].kptr.btf))
@@ -597,6 +687,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (btf_is_kernel(fields[i].kptr.btf))
btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
@@ -714,6 +805,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
field->kptr.dtor(xchgd_field);
}
break;
+ case BPF_UPTR:
+ /* The caller ensured that no one is using the uptr */
+ unpin_uptr_kaddr(*(void **)field_ptr);
+ break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
@@ -1105,7 +1200,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1161,6 +1256,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
goto free_map_tab;
}
break;
+ case BPF_UPTR:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
@@ -2933,17 +3034,33 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
- const struct bpf_link_ops *ops, struct bpf_prog *prog)
+/* bpf_link_init_sleepable() allows specifying whether the BPF link itself has
+ * "sleepable" semantics, which normally would mean that BPF link's attach
+ * hook can dereference link or link's underlying program for some time after
+ * detachment due to RCU Tasks Trace-based lifetime protection scheme.
+ * BPF program itself can be non-sleepable, yet, because it's transitively
+ * reachable through BPF link, its freeing has to be delayed until after RCU
+ * Tasks Trace GP.
+ */
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ bool sleepable)
{
WARN_ON(ops->dealloc && ops->dealloc_deferred);
atomic64_set(&link->refcnt, 1);
link->type = type;
+ link->sleepable = sleepable;
link->id = 0;
link->ops = ops;
link->prog = prog;
}
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
+{
+ bpf_link_init_sleepable(link, type, ops, prog, false);
+}
+
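
A hedged sketch of how a link type would opt in (mylink and my_link_lops are made-up names; the link type constant is reused from the raw tracepoint case later in this patch):

/* Attach hook may still run for a short while after detach and needs
 * RCU Tasks Trace protection, even when the program is non-sleepable: */
bpf_link_init_sleepable(&mylink->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
			&my_link_lops, prog, true);

/* Everything else keeps the old helper, which now just forwards: */
bpf_link_init(&mylink->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
	      &my_link_lops, prog);

bpf_link_free() then routes the deallocation through call_rcu_tasks_trace() followed by a regular RCU grace period, as it already did for sleepable programs.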
static void bpf_link_free_id(int id)
{
if (!id)
@@ -2976,12 +3093,24 @@ void bpf_link_inc(struct bpf_link *link)
atomic64_inc(&link->refcnt);
}
+static void bpf_link_dealloc(struct bpf_link *link)
+{
+ /* now that we know that bpf_link itself can't be reached, put underlying BPF program */
+ if (link->prog)
+ bpf_prog_put(link->prog);
+
+ /* free bpf_link and its containing memory */
+ if (link->ops->dealloc_deferred)
+ link->ops->dealloc_deferred(link);
+ else
+ link->ops->dealloc(link);
+}
+
static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
{
struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
- /* free bpf_link and its containing memory */
- link->ops->dealloc_deferred(link);
+ bpf_link_dealloc(link);
}
static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
@@ -2996,26 +3125,27 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
static void bpf_link_free(struct bpf_link *link)
{
const struct bpf_link_ops *ops = link->ops;
- bool sleepable = false;
bpf_link_free_id(link->id);
- if (link->prog) {
- sleepable = link->prog->sleepable;
- /* detach BPF program, clean up used resources */
+ /* detach BPF program, clean up used resources */
+ if (link->prog)
ops->release(link);
- bpf_prog_put(link->prog);
- }
if (ops->dealloc_deferred) {
- /* schedule BPF link deallocation; if underlying BPF program
- * is sleepable, we need to first wait for RCU tasks trace
- * sync, then go through "classic" RCU grace period
+ /* Schedule BPF link deallocation, which will only then
+ * trigger putting BPF program refcount.
+ * If underlying BPF program is sleepable or BPF link's target
+		 * attach hook point is sleepable or otherwise requires RCU GPs
+		 * to ensure the link and its underlying BPF program are not
+ * reachable anymore, we need to first wait for RCU tasks
+ * trace sync, and then go through "classic" RCU grace period
*/
- if (sleepable)
+ if (link->sleepable || (link->prog && link->prog->sleepable))
call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
- } else if (ops->dealloc)
- ops->dealloc(link);
+ } else if (ops->dealloc) {
+ bpf_link_dealloc(link);
+ }
}
static void bpf_link_put_deferred(struct work_struct *work)
@@ -3218,7 +3348,8 @@ static void bpf_tracing_link_release(struct bpf_link *link)
container_of(link, struct bpf_tracing_link, link.link);
WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
- tr_link->trampoline));
+ tr_link->trampoline,
+ tr_link->tgt_prog));
bpf_trampoline_put(tr_link->trampoline);
@@ -3358,7 +3489,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* in prog->aux
*
* - if prog->aux->dst_trampoline is NULL, the program has already been
- * attached to a target and its initial target was cleared (below)
+ * attached to a target and its initial target was cleared (below)
*
* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
* target_btf_id using the link_create API.
@@ -3433,7 +3564,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(&link->link, tr);
+ err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -3835,8 +3966,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
- &bpf_raw_tp_link_lops, prog);
+ bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog,
+ tracepoint_is_faultable(btp->tp));
link->btp = btp;
link->cookie = cookie;
@@ -4002,10 +4134,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
attach_type != BPF_TRACE_UPROBE_MULTI)
return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
if (attach_type != BPF_PERF_EVENT &&
attach_type != BPF_TRACE_KPROBE_MULTI &&
attach_type != BPF_TRACE_KPROBE_SESSION &&
- attach_type != BPF_TRACE_UPROBE_MULTI)
+ attach_type != BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
return -EINVAL;
return 0;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -5258,7 +5394,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
ret = bpf_kprobe_multi_link_attach(attr, prog);
- else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f8302a5ca400..c4b1a98ff726 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
- bpf_image_ksym_add(image, size, ksym);
+ bpf_image_ksym_init(image, size, ksym);
+ bpf_image_ksym_add(ksym);
return im;
out_free_image:
@@ -523,7 +528,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
+{
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+
+ guard(mutex)(&aux->ext_mutex);
+ if (aux->prog_array_member_cnt)
+		/* Program extensions cannot extend the target prog when the target
+		 * prog has been added to any prog_array map as a tail callee.
+		 * This prevents a potential infinite loop like:
+ * tgt prog entry -> tgt prog subprog -> freplace prog entry
+ * --tailcall-> tgt prog entry.
+ */
+ return -EBUSY;
+
+ aux->is_extended = true;
+ return 0;
+}
+
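
A standalone toy model of the mutual exclusion introduced here; the mirror check that rejects prog_array updates for an already-extended program is assumed to live in the map update path and is not part of this hunk:

#include <errno.h>
#include <stdio.h>

/* A program may either sit in a prog_array (tail-call target) or be
 * extended by an freplace program, never both, to avoid
 * tgt entry -> tgt subprog -> freplace entry -> tailcall -> tgt entry. */
struct toy_prog {
	int prog_array_member_cnt;
	int is_extended;
};

static int toy_attach_freplace(struct toy_prog *tgt)
{
	if (tgt->prog_array_member_cnt)
		return -EBUSY;
	tgt->is_extended = 1;
	return 0;
}

static int toy_add_to_prog_array(struct toy_prog *p)
{
	if (p->is_extended)
		return -EBUSY;	/* assumed mirror check on the map side */
	p->prog_array_member_cnt++;
	return 0;
}

int main(void)
{
	struct toy_prog p = { 0, 0 };

	toy_add_to_prog_array(&p);
	printf("%d\n", toy_attach_freplace(&p));	/* prints -16 (-EBUSY) */
	return 0;
}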
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
@@ -544,6 +569,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
/* Cannot attach extension if fentry/fexit are in use. */
if (cnt)
return -EBUSY;
+ err = bpf_freplace_check_tgt_prog(tgt_prog);
+ if (err)
+ return err;
tr->extension_prog = link->link.prog;
return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
@@ -570,17 +598,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
return err;
}
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
- err = __bpf_trampoline_link_prog(link, tr);
+ err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
-static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
int err;
@@ -591,6 +623,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
+ guard(mutex)(&tgt_prog->aux->ext_mutex);
+ tgt_prog->aux->is_extended = false;
return err;
}
hlist_del_init(&link->tramp_hlist);
@@ -599,12 +633,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
}
/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
- err = __bpf_trampoline_unlink_prog(link, tr);
+ err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
@@ -619,7 +655,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
if (!shim_link->trampoline)
return;
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
bpf_trampoline_put(shim_link->trampoline);
}
@@ -733,7 +769,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
goto err;
}
- err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
if (err)
goto err;
@@ -868,6 +904,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -944,6 +982,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
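
The recursion_detected() callback is only invoked here; a hedged sketch of a consumer (the registration point is hypothetical, this diff only adds the call sites):

static void my_recursion_detected(struct bpf_prog *prog)
{
	/* complements the existing misses counter with an explicit signal */
	pr_warn_once("bpf prog %s recursed and was skipped\n", prog->aux->name);
}

/* somewhere in the owning subsystem, after the program is loaded: */
prog->aux->recursion_detected = my_recursion_detected;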
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb99bada7e2e..1c4ebb326785 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -418,6 +420,25 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
return rec;
}
+static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) &&
+ bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id;
+}
+
+static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ if (!mask_raw_tp_reg_cond(env, reg))
+ return false;
+ reg->type &= ~PTR_MAYBE_NULL;
+ return true;
+}
+
+static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result)
+{
+ if (result)
+ reg->type |= PTR_MAYBE_NULL;
+}
+
static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
{
struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
@@ -1265,6 +1286,7 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun
if (!dst->refs)
return -ENOMEM;
+ dst->active_locks = src->active_locks;
dst->acquired_refs = src->acquired_refs;
return 0;
}
@@ -1335,13 +1357,32 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
if (err)
return err;
id = ++env->id_gen;
+ state->refs[new_ofs].type = REF_TYPE_PTR;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
- state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
+static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
+ int id, void *ptr)
+{
+ struct bpf_func_state *state = cur_func(env);
+ int new_ofs = state->acquired_refs;
+ int err;
+
+ err = resize_reference_state(state, state->acquired_refs + 1);
+ if (err)
+ return err;
+ state->refs[new_ofs].type = type;
+ state->refs[new_ofs].id = id;
+ state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].ptr = ptr;
+
+ state->active_locks++;
+ return 0;
+}
+
/* release function corresponding to acquire_reference_state(). Idempotent. */
static int release_reference_state(struct bpf_func_state *state, int ptr_id)
{
@@ -1349,10 +1390,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
if (state->refs[i].id == ptr_id) {
- /* Cannot release caller references in callbacks */
- if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
- return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -1364,6 +1404,45 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
return -EINVAL;
}
+static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr)
+{
+ int i, last_idx;
+
+ last_idx = state->acquired_refs - 1;
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != type)
+ continue;
+ if (state->refs[i].id == id && state->refs[i].ptr == ptr) {
+ if (last_idx && i != last_idx)
+ memcpy(&state->refs[i], &state->refs[last_idx],
+ sizeof(*state->refs));
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+ state->acquired_refs--;
+ state->active_locks--;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type,
+ int id, void *ptr)
+{
+ struct bpf_func_state *state = cur_func(env);
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ struct bpf_reference_state *s = &state->refs[i];
+
+ if (s->type == REF_TYPE_PTR || s->type != type)
+ continue;
+
+ if (s->id == id && s->ptr == ptr)
+ return s;
+ }
+ return NULL;
+}
+
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -1373,13 +1452,6 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
-static void clear_jmp_history(struct bpf_verifier_state *state)
-{
- kfree(state->jmp_history);
- state->jmp_history = NULL;
- state->jmp_history_cnt = 0;
-}
-
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -1389,7 +1461,6 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
- clear_jmp_history(state);
if (free_self)
kfree(state);
}
@@ -1415,13 +1486,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
struct bpf_func_state *dst;
int i, err;
- dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
- GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- dst_state->jmp_history_cnt = src->jmp_history_cnt;
-
/* if dst has more stack frames then src frame, free them, this is also
* necessary in case of exceptional exits using bpf_throw.
*/
@@ -1434,12 +1498,12 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->active_preempt_lock = src->active_preempt_lock;
dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
- dst_state->active_lock.ptr = src->active_lock.ptr;
- dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->insn_hist_start = src->insn_hist_start;
+ dst_state->insn_hist_end = src->insn_hist_end;
dst_state->dfs_depth = src->dfs_depth;
dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
@@ -2492,9 +2556,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
+	 * But we do need to make sure not to clobber insn_hist, so we keep
+ * chaining insn_hist_start/insn_hist_end indices as for a normal
+ * child state.
*/
elem->st.branches = 1;
elem->st.in_sleepable = is_sleepable;
+ elem->st.insn_hist_start = env->cur_state->insn_hist_end;
+ elem->st.insn_hist_end = elem->st.insn_hist_start;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
@@ -3474,11 +3543,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
}
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
+static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
{
- u32 cnt = cur->jmp_history_cnt;
- struct bpf_jmp_history_entry *p;
+ struct bpf_insn_hist_entry *p;
size_t alloc_size;
/* combine instruction flags if we already recorded this instruction */
@@ -3498,29 +3566,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
return 0;
}
- cnt++;
- alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
- p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
- if (!p)
- return -ENOMEM;
- cur->jmp_history = p;
+ if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
+ alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
+ p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
+ if (!p)
+ return -ENOMEM;
+ env->insn_hist = p;
+ env->insn_hist_cap = alloc_size / sizeof(*p);
+ }
- p = &cur->jmp_history[cnt - 1];
+ p = &env->insn_hist[cur->insn_hist_end];
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
p->linked_regs = linked_regs;
- cur->jmp_history_cnt = cnt;
+
+ cur->insn_hist_end++;
env->cur_hist_ent = p;
return 0;
}
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
- u32 hist_end, int insn_idx)
+static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
+ u32 hist_start, u32 hist_end, int insn_idx)
{
- if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
- return &st->jmp_history[hist_end - 1];
+ if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
+ return &env->insn_hist[hist_end - 1];
return NULL;
}
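
The rework replaces the per-state, kmalloc'ed jmp_history arrays with a single kvrealloc'ed buffer in bpf_verifier_env; each verifier state now only records an [insn_hist_start, insn_hist_end) window into it, so a child state chains onto its parent's window instead of copying it. A standalone model of that windowing (illustrative only):

#include <stdio.h>
#include <stdlib.h>

struct hist_entry { int idx, prev_idx; };

static struct hist_entry *hist;		/* shared, growable buffer (env->insn_hist) */
static unsigned int hist_cap;

struct toy_state { unsigned int hist_start, hist_end; };	/* per verifier state */

static int push_entry(struct toy_state *st, int idx, int prev_idx)
{
	if (st->hist_end + 1 > hist_cap) {
		unsigned int cap = hist_cap ? hist_cap * 2 : 8;
		struct hist_entry *p = realloc(hist, cap * sizeof(*p));

		if (!p)
			return -1;
		hist = p;
		hist_cap = cap;
	}
	hist[st->hist_end].idx = idx;
	hist[st->hist_end].prev_idx = prev_idx;
	st->hist_end++;
	return 0;
}

int main(void)
{
	struct toy_state parent = { 0, 0 };

	push_entry(&parent, 5, 3);
	push_entry(&parent, 9, 5);

	/* A child state keeps appending instead of copying the parent's history. */
	struct toy_state child = { parent.hist_end, parent.hist_end };

	push_entry(&child, 14, 9);
	printf("parent window [%u,%u), child window [%u,%u)\n",
	       parent.hist_start, parent.hist_end, child.hist_start, child.hist_end);
	free(hist);
	return 0;
}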
@@ -3537,25 +3608,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat
* history entry recording a jump from last instruction of parent state and
* first instruction of given state.
*/
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
- u32 *history)
+static int get_prev_insn_idx(const struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ int insn_idx, u32 hist_start, u32 *hist_endp)
{
- u32 cnt = *history;
+ u32 hist_end = *hist_endp;
+ u32 cnt = hist_end - hist_start;
- if (i == st->first_insn_idx) {
+ if (insn_idx == st->first_insn_idx) {
if (cnt == 0)
return -ENOENT;
- if (cnt == 1 && st->jmp_history[0].idx == i)
+ if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
return -ENOENT;
}
- if (cnt && st->jmp_history[cnt - 1].idx == i) {
- i = st->jmp_history[cnt - 1].prev_idx;
- (*history)--;
+ if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
+ (*hist_endp)--;
+ return env->insn_hist[hist_end - 1].prev_idx;
} else {
- i--;
+ return insn_idx - 1;
}
- return i;
}
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -3727,7 +3799,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
/* If any register R in hist->linked_regs is marked as precise in bt,
* do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
{
struct linked_regs linked_regs;
bool some_precise = false;
@@ -3772,7 +3844,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+ struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -4191,7 +4263,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
* SCALARS, as well as any other registers and slots that contribute to
* a tracked state of given registers/stack slots, depending on specific BPF
* assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
+ * logic). This backtracking relies on recorded insn_hist and is able to
* traverse entire chain of parent states. This process ends only when all the
* necessary registers/slots and their transitive dependencies are marked as
* precise.
@@ -4308,8 +4380,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
- u32 history = st->jmp_history_cnt;
- struct bpf_jmp_history_entry *hist;
+ u32 hist_start = st->insn_hist_start;
+ u32 hist_end = st->insn_hist_end;
+ struct bpf_insn_hist_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4348,7 +4421,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- hist = get_jmp_hist_entry(st, history, i);
+ hist = get_insn_hist_entry(env, hist_start, hist_end, i);
err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
@@ -4365,7 +4438,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
*/
return 0;
subseq_idx = i;
- i = get_prev_insn_idx(st, i, &history);
+ i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
if (i == -ENOENT)
break;
if (i >= env->prog->len) {
@@ -4731,7 +4804,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5038,7 +5111,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5423,7 +5496,7 @@ static bool in_sleepable(struct bpf_verifier_env *env)
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
return env->cur_state->active_rcu_lock ||
- env->cur_state->active_lock.ptr ||
+ cur_func(env)->active_locks ||
!in_sleepable(env);
}
@@ -5491,6 +5564,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr
return ret;
}
+static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
+ struct btf_field *field)
+{
+ struct bpf_reg_state *reg;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ mark_reg_known_zero(env, cur_regs(env), regno);
+ reg = reg_state(env, regno);
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+ reg->mem_size = t->size;
+ reg->id = ++env->id_gen;
+
+ return 0;
+}
+
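
On the program side this means a load from a uptr field produces a possibly-NULL PTR_TO_MEM of the pointed-to struct's size, while a store to the field itself is rejected. A hedged BPF-C sketch; the __uptr type-tag spelling and the libbpf map declaration style are assumptions and are not shown in this patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define __uptr __attribute__((btf_type_tag("uptr")))	/* assumed spelling */

struct user_data {
	int cnt;
};

struct value_type {
	struct user_data __uptr *udata;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct value_type);
} datamap SEC(".maps");

SEC("tp_btf/sys_enter")
int count_enter(void *ctx)
{
	struct value_type *v;

	v = bpf_task_storage_get(&datamap, bpf_get_current_task_btf(), 0, 0);
	if (!v || !v->udata)		/* PTR_MAYBE_NULL: must check first */
		return 0;
	v->udata->cnt += 1;		/* access through the uptr is fine */
	/* v->udata = NULL; would be rejected: "store to uptr disallowed" */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";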
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
struct btf_field *kptr_field)
@@ -5519,9 +5608,15 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
+ if (class != BPF_LDX && kptr_field->type == BPF_UPTR) {
+ verbose(env, "store to uptr disallowed\n");
+ return -EACCES;
+ }
if (class == BPF_LDX) {
- val_reg = reg_state(env, value_regno);
+ if (kptr_field->type == BPF_UPTR)
+ return mark_uptr_ld_reg(env, value_regno, kptr_field);
+
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
@@ -5579,21 +5674,26 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (src != ACCESS_DIRECT) {
- verbose(env, "kptr cannot be accessed indirectly by helper\n");
+ verbose(env, "%s cannot be accessed indirectly by helper\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "kptr access cannot have variable offset\n");
+ verbose(env, "%s access cannot have variable offset\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
if (p != off + reg->var_off.value) {
- verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+ verbose(env, "%s access misaligned expected=%u off=%llu\n",
+ btf_field_type_name(field->type),
p, off + reg->var_off.value);
return -EACCES;
}
if (size != bpf_size_to_bytes(BPF_DW)) {
- verbose(env, "kptr access size must be BPF_DW\n");
+ verbose(env, "%s access size must be BPF_DW\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
break;
@@ -5988,6 +6088,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+	 * while kprobe/tp/perf_event/raw_tp don't use a trampoline and hence
+	 * are checked explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return NO_PRIV_STACK;
+}
+
static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{
if (env->prog->jit_requested)
@@ -6005,17 +6133,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int depth = 0, frame = 0, i, subprog_end;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -6042,11 +6173,31 @@ process_func:
depth);
return -EACCES;
}
- depth += round_up_stack_depth(env, subprog[idx].stack_depth);
- if (depth > MAX_BPF_STACK) {
- verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
- return -EACCES;
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+		 * avoid a JIT penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
}
continue_func:
subprog_end = subprog[idx + 1].start;
@@ -6103,6 +6254,8 @@ continue_func:
}
i = next_insn;
idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -6136,7 +6289,8 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
@@ -6145,17 +6299,45 @@ continue_func:
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
int ret;
for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
+ }
+
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
- ret = check_max_stack_depth_subprog(env, i);
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
if (ret < 0)
return ret;
}
- continue;
}
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ env->prog->aux->jits_use_priv_stack = true;
+ break;
+ }
+ }
+
return 0;
}
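
A standalone illustration of the new accounting: subprogs that get a private stack are bounded individually against MAX_BPF_STACK, everything else still accumulates along the call chain (numbers made up; the real decision also depends on JIT support, tail calls and async callbacks as handled above):

#include <stdio.h>

#define MAX_BPF_STACK		512
#define BPF_PRIV_STACK_MIN_SIZE	64

int main(void)
{
	/* stack depth of main -> a -> b along one call chain */
	int depth[] = { 48, 200, 400 };
	int shared = 0;

	for (int i = 0; i < 3; i++) {
		if (depth[i] >= BPF_PRIV_STACK_MIN_SIZE) {
			/* private stack: only the individual bound applies */
			if (depth[i] > MAX_BPF_STACK)
				printf("subprog %d too large\n", i);
		} else {
			shared += depth[i];
		}
	}
	/* 48 <= 512 passes; without private stacks 48+200+400 = 648 would fail */
	printf("shared depth %d (limit %d)\n", shared, MAX_BPF_STACK);
	return 0;
}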
@@ -6595,6 +6777,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
const char *field_name = NULL;
enum bpf_type_flag flag = 0;
u32 btf_id = 0;
+ bool mask;
int ret;
if (!env->allow_ptr_leaks) {
@@ -6666,7 +6849,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (ret < 0)
return ret;
-
+ /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL
+	 * trusted PTR_TO_BTF_ID; these are the ones that are possibly
+	 * arguments to the raw_tp. Since the internal checks for a trusted
+	 * reg in check_ptr_to_btf_access would consider the PTR_MAYBE_NULL
+	 * modifier as problematic, mask it out temporarily for the
+ * check. Don't apply this to pointers with ref_obj_id > 0, as
+ * those won't be raw_tp args.
+ *
+ * We may end up applying this relaxation to other trusted
+ * PTR_TO_BTF_ID with maybe null flag, since we cannot
+ * distinguish PTR_MAYBE_NULL tagged for arguments vs normal
+ * tagging, but that should expand allowed behavior, and not
+ * cause regression for existing behavior.
+ */
+ mask = mask_raw_tp_reg(env, reg);
if (ret != PTR_TO_BTF_ID) {
/* just mark; */
@@ -6727,8 +6924,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
clear_trusted_flags(&flag);
}
- if (atype == BPF_READ && value_regno >= 0)
+ if (atype == BPF_READ && value_regno >= 0) {
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+ /* We've assigned a new type to regno, so don't undo masking. */
+ if (regno == value_regno)
+ mask = false;
+ }
+ unmask_raw_tp_reg(reg, mask);
return 0;
}
@@ -6949,7 +7151,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
if (tnum_is_const(reg->var_off))
kptr_field = btf_record_find(reg->map_ptr->record,
- off + reg->var_off.value, BPF_KPTR);
+ off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
if (kptr_field) {
err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
@@ -7103,7 +7305,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
- !type_may_be_null(reg->type)) {
+ (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
@@ -7648,19 +7850,20 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_lock remembers which map value element or allocated
+ * cur_func(env)->active_locks remembers which map value element or allocated
* object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_func_state *cur = cur_func(env);
u64 val = reg->var_off.value;
struct bpf_map *map = NULL;
struct btf *btf = NULL;
struct btf_record *rec;
+ int err;
if (!is_const) {
verbose(env,
@@ -7692,16 +7895,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
if (is_lock) {
- if (cur->active_lock.ptr) {
+ void *ptr;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (cur->active_locks) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- if (map)
- cur->active_lock.ptr = map;
- else
- cur->active_lock.ptr = btf;
- cur->active_lock.id = reg->id;
+ err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr);
+ if (err < 0) {
+ verbose(env, "Failed to acquire lock state\n");
+ return err;
+ }
} else {
void *ptr;
@@ -7710,20 +7920,17 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
else
ptr = btf;
- if (!cur->active_lock.ptr) {
+ if (!cur->active_locks) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_lock.ptr != ptr ||
- cur->active_lock.id != reg->id) {
+
+ if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
invalidate_non_owning_refs(env);
-
- cur->active_lock.ptr = NULL;
- cur->active_lock.id = 0;
}
return 0;
}
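
Spin-lock tracking now lives in the same acquired_refs array as pointer references, just with a different type tag; a minimal standalone model of that bookkeeping (illustrative only):

#include <stdio.h>

enum ref_type { REF_PTR, REF_LOCK };

struct ref { enum ref_type type; int id; void *ptr; };

struct state {
	struct ref refs[8];
	int acquired_refs;
	int active_locks;
};

static void acquire_lock(struct state *s, int id, void *ptr)
{
	s->refs[s->acquired_refs++] = (struct ref){ REF_LOCK, id, ptr };
	s->active_locks++;
}

static int release_lock(struct state *s, int id, void *ptr)
{
	for (int i = 0; i < s->acquired_refs; i++) {
		if (s->refs[i].type != REF_LOCK)
			continue;
		if (s->refs[i].id == id && s->refs[i].ptr == ptr) {
			s->refs[i] = s->refs[--s->acquired_refs];
			s->active_locks--;
			return 0;
		}
	}
	return -1;	/* "bpf_spin_unlock of different lock" */
}

int main(void)
{
	struct state st = { 0 };
	int map;

	acquire_lock(&st, 1, &map);
	printf("%d\n", release_lock(&st, 2, &map));	/* -1: id mismatch */
	printf("%d\n", release_lock(&st, 1, &map));	/* 0 */
	return 0;
}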
@@ -8796,6 +9003,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
int err = 0;
+ bool mask;
if (arg_type == ARG_DONTCARE)
return 0;
@@ -8836,11 +9044,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
arg_btf_id = fn->arg_btf_id[arg];
+ mask = mask_raw_tp_reg(env, reg);
err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
- if (err)
- return err;
- err = check_func_arg_reg_off(env, reg, regno, arg_type);
+ err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type);
+ unmask_raw_tp_reg(reg, mask);
if (err)
return err;
@@ -9635,14 +9843,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
return ret;
} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
struct bpf_call_arg_meta meta;
+ bool mask;
int err;
if (register_is_null(reg) && type_may_be_null(arg->arg_type))
continue;
memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+ mask = mask_raw_tp_reg(env, reg);
err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+ unmask_raw_tp_reg(reg, mask);
if (err)
return err;
} else {
@@ -9781,7 +9992,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
const char *sub_name = subprog_name(env, subprog);
/* Only global subprogs cannot be called with a lock held. */
- if (env->cur_state->active_lock.ptr) {
+ if (cur_func(env)->active_locks) {
verbose(env, "global function calls are not allowed while holding a lock,\n"
"use static function instead\n");
return -EINVAL;
@@ -9910,7 +10121,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
{
/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
* u64 flags);
- * callback_fn(u32 index, void *callback_ctx);
+ * callback_fn(u64 index, void *callback_ctx);
*/
callee->regs[BPF_REG_1].type = SCALAR_VALUE;
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
@@ -10122,17 +10333,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* callback_fn frame should have released its own additions to parent's
- * reference state at this point, or check_reference_leak would
- * complain, hence it must be the same as the caller. There is no need
- * to copy it back.
- */
- if (!callee->in_callback_fn) {
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
- }
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
* there function call logic would reschedule callback visit. If iteration
@@ -10302,11 +10506,11 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
bool refs_lingering = false;
int i;
- if (!exception_exit && state->frameno && !state->in_callback_fn)
+ if (!exception_exit && state->frameno)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
- if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ if (state->refs[i].type != REF_TYPE_PTR)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -10315,6 +10519,34 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
return refs_lingering ? -EINVAL : 0;
}
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+ int err;
+
+ if (check_lock && cur_func(env)->active_locks) {
+ verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ err = check_reference_leak(env, exception_exit);
+ if (err) {
+ verbose(env, "%s would lead to reference leak\n", prefix);
+ return err;
+ }
+
+ if (check_lock && env->cur_state->active_rcu_lock) {
+ verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_preempt_lock) {
+ verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -10583,11 +10815,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env, false);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
+ err = check_resource_leak(env, false, true, "tail_call");
+ if (err)
return err;
- }
break;
case BPF_FUNC_get_local_storage:
/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -11252,6 +11482,7 @@ enum special_kfunc_type {
KF_bpf_preempt_enable,
KF_bpf_iter_css_task_new,
KF_bpf_session_cookie,
+ KF_bpf_get_kmem_cache,
};
BTF_SET_START(special_kfunc_set)
@@ -11317,6 +11548,7 @@ BTF_ID(func, bpf_session_cookie)
#else
BTF_ID_UNUSED
#endif
+BTF_ID(func, bpf_get_kmem_cache)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -11512,10 +11744,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *state = env->cur_state;
struct btf_record *rec = reg_btf_record(reg);
- if (!state->active_lock.ptr) {
+ if (!cur_func(env)->active_locks) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
return -EFAULT;
}
@@ -11612,6 +11843,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
*/
static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
+ struct bpf_reference_state *s;
void *ptr;
u32 id;
@@ -11628,10 +11860,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
}
id = reg->id;
- if (!env->cur_state->active_lock.ptr)
+ if (!cur_func(env)->active_locks)
return -EINVAL;
- if (env->cur_state->active_lock.ptr != ptr ||
- env->cur_state->active_lock.id != id) {
+ s = find_lock_state(env, REF_TYPE_LOCK, id, ptr);
+ if (!s) {
verbose(env, "held lock and object are not in the same allocation\n");
return -EINVAL;
}
@@ -11942,6 +12174,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1, ref_id, type_size;
bool is_ret_buf_sz = false;
+ bool mask = false;
int kf_arg_type;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
@@ -12000,12 +12233,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
+ mask = mask_raw_tp_reg(env, reg);
if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
(register_is_null(reg) || type_may_be_null(reg->type)) &&
!is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ unmask_raw_tp_reg(reg, mask);
return -EACCES;
}
+ unmask_raw_tp_reg(reg, mask);
if (reg->ref_obj_id) {
if (is_kfunc_release(meta) && meta->ref_obj_id) {
@@ -12063,16 +12299,24 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
break;
+ /* Allow passing maybe NULL raw_tp arguments to
+ * kfuncs for compatibility. Don't apply this to
+ * arguments with ref_obj_id > 0.
+ */
+ mask = mask_raw_tp_reg(env, reg);
if (!is_trusted_reg(reg)) {
if (!is_kfunc_rcu(meta)) {
verbose(env, "R%d must be referenced or trusted\n", regno);
+ unmask_raw_tp_reg(reg, mask);
return -EINVAL;
}
if (!is_rcu_reg(reg)) {
verbose(env, "R%d must be a rcu pointer\n", regno);
+ unmask_raw_tp_reg(reg, mask);
return -EINVAL;
}
}
+ unmask_raw_tp_reg(reg, mask);
fallthrough;
case KF_ARG_PTR_TO_CTX:
case KF_ARG_PTR_TO_DYNPTR:
@@ -12095,7 +12339,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_release(meta) && reg->ref_obj_id)
arg_type |= OBJ_RELEASE;
+ mask = mask_raw_tp_reg(env, reg);
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ unmask_raw_tp_reg(reg, mask);
if (ret < 0)
return ret;
@@ -12272,6 +12518,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
fallthrough;
case KF_ARG_PTR_TO_BTF_ID:
+ mask = mask_raw_tp_reg(env, reg);
/* Only base_type is checked, further checks are done here */
if ((base_type(reg->type) != PTR_TO_BTF_ID ||
(bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
@@ -12280,9 +12527,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "expected %s or socket\n",
reg_type_str(env, base_type(reg->type) |
(type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ unmask_raw_tp_reg(reg, mask);
return -EINVAL;
}
ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ unmask_raw_tp_reg(reg, mask);
if (ret < 0)
return ret;
break;
@@ -12827,6 +13076,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+ if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+ regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+
if (is_iter_next_kfunc(&meta)) {
struct bpf_reg_state *cur_iter;
@@ -13252,7 +13504,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
*/
static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
- const struct bpf_reg_state *ptr_reg,
+ struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
struct bpf_verifier_state *vstate = env->cur_state;
@@ -13266,6 +13518,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg;
+ bool mask;
int ret;
dst_reg = &regs[dst];
@@ -13292,11 +13545,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
+ mask = mask_raw_tp_reg(env, ptr_reg);
if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
dst, reg_type_str(env, ptr_reg->type));
+ unmask_raw_tp_reg(ptr_reg, mask);
return -EACCES;
}
+ unmask_raw_tp_reg(ptr_reg, mask);
switch (base_type(ptr_reg->type)) {
case PTR_TO_CTX:
@@ -15480,7 +15736,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
if (linked_regs.cnt > 1) {
- err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
if (err)
return err;
}
@@ -15744,26 +16000,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env, false);
- if (err) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+ err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
+ if (err)
return err;
- }
-
- if (env->cur_state->active_lock.ptr) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_rcu_lock) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_preempt_lock) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
- return -EINVAL;
- }
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
@@ -15909,6 +16148,16 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
return -ENOTSUPP;
}
break;
+ case BPF_PROG_TYPE_KPROBE:
+ switch (env->prog->expected_attach_type) {
+ case BPF_TRACE_KPROBE_SESSION:
+ case BPF_TRACE_UPROBE_SESSION:
+ range = retval_range(0, 1);
+ break;
+ default:
+ return 0;
+ }
+ break;
case BPF_PROG_TYPE_SK_LOOKUP:
range = retval_range(SK_DROP, SK_PASS);
break;
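
A hedged BPF-C sketch of what this 0/1 range check applies to; SEC("uprobe.session") and bpf_session_is_return() follow the existing kprobe session conventions and are assumptions as far as this hunk is concerned:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern bool bpf_session_is_return(void) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("uprobe.session")
int BPF_UPROBE(trace_func)
{
	if (bpf_session_is_return())
		return 0;	/* return leg: value still limited to 0 or 1 */
	return 1;		/* entry leg: 1 skips the return probe for this call */
}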
@@ -16175,10 +16424,7 @@ static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */
static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta)
{
- if (meta->btf == btf_vmlinux)
- return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
- meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast];
- return false;
+ return meta->kfunc_flags & KF_FASTCALL;
}
/* LLVM define a bpf_fastcall function attribute.
@@ -17513,8 +17759,20 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
return false;
for (i = 0; i < old->acquired_refs; i++) {
- if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+ old->refs[i].type != cur->refs[i].type)
return false;
+ switch (old->refs[i].type) {
+ case REF_TYPE_PTR:
+ break;
+ case REF_TYPE_LOCK:
+ if (old->refs[i].ptr != cur->refs[i].ptr)
+ return false;
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
+ return false;
+ }
}
return true;
@@ -17592,19 +17850,6 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
- if (old->active_lock.ptr != cur->active_lock.ptr)
- return false;
-
- /* Old and cur active_lock's have to be either both present
- * or both absent.
- */
- if (!!old->active_lock.id != !!cur->active_lock.id)
- return false;
-
- if (old->active_lock.id &&
- !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
- return false;
-
if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
@@ -17880,7 +18125,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
/* Avoid accumulating infinitely long jmp history */
- cur->jmp_history_cnt > 40;
+ cur->insn_hist_end - cur->insn_hist_start > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -18078,7 +18323,7 @@ hit:
* the current state.
*/
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0, 0);
+ err = err ? : push_insn_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -18177,8 +18422,8 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->insn_hist_start = cur->insn_hist_end;
cur->dfs_depth = new->dfs_depth + 1;
- clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx);
*explored_state(env, insn_idx) = new_sl;
/* connect new state to parentage chain. Current frame needs all
@@ -18346,7 +18591,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0, 0);
+ err = push_insn_history(env, state, 0, 0);
if (err)
return err;
}
@@ -18506,7 +18751,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_lock.ptr) {
+ if (cur_func(env)->active_locks) {
if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
(insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
@@ -18555,30 +18800,14 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
process_bpf_exit_full:
- if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
- verbose(env, "bpf_spin_unlock is missing\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
- verbose(env, "bpf_rcu_read_unlock is missing\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
- verbose(env, "%d bpf_preempt_enable%s missing\n",
- env->cur_state->active_preempt_lock,
- env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
- return -EINVAL;
- }
-
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback
* function, for which reference_state must
* match caller reference state when it exits.
*/
- err = check_reference_leak(env, exception_exit);
+ err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
+ "BPF_EXIT instruction");
if (err)
return err;
@@ -19837,6 +20066,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
* for this case.
*/
case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+ case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
if (type == BPF_READ) {
if (BPF_MODE(insn->code) == BPF_MEM)
insn->code = BPF_LDX | BPF_PROBE_MEM |
@@ -20041,6 +20271,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
func[i]->jit_requested = 1;
func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
@@ -21809,6 +22042,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
/* btf_ctx_access() used this to provide argument type info */
prog->aux->ctx_arg_info =
st_ops_desc->arg_info[member_idx].info;
@@ -22547,6 +22785,7 @@ err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
+ kvfree(env->insn_hist);
err_free_env:
kvfree(env);
return ret;
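For context on what the consolidated leak check above catches, the fragment below is a minimal, hypothetical libbpf-style program that the verifier rejects because a bpf_spin_lock is still held at BPF_EXIT; the map, section and program names are illustrative and are not part of this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct val {
	struct bpf_spin_lock lock;
	int counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct val);
} vals SEC(".maps");

SEC("tc")
int lock_leak(struct __sk_buff *skb)
{
	int key = 0;
	struct val *v = bpf_map_lookup_elem(&vals, &key);

	if (!v)
		return 0;
	bpf_spin_lock(&v->lock);
	v->counter++;
	/* bpf_spin_unlock(&v->lock) is intentionally missing: the verifier
	 * reports the held lock as a resource leak at BPF_EXIT.
	 */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";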
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9bc4a84bd309..d9061bd55436 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2140,8 +2140,10 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto exit_stats;
- ret = cgroup_bpf_inherit(root_cgrp);
- WARN_ON_ONCE(ret);
+ if (root == &cgrp_dfl_root) {
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+ }
trace_cgroup_setup_root(root);
@@ -2314,10 +2316,8 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
- cgroup_bpf_offline(&root->cgrp);
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
- }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -5710,9 +5710,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_kernfs_remove;
- ret = cgroup_bpf_inherit(cgrp);
- if (ret)
- goto out_psi_free;
+ if (cgrp->root == &cgrp_dfl_root) {
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_psi_free;
+ }
/*
* New cgroup inherits effective freeze counter, and
@@ -6026,7 +6028,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
cgroup1_check_for_release(parent);
- cgroup_bpf_offline(cgrp);
+ if (cgrp->root == &cgrp_dfl_root)
+ cgroup_bpf_offline(cgrp);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a4dd285cdf39..f321ed515f3a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -84,9 +84,19 @@ static bool have_boot_isolcpus;
static struct list_head remote_children;
/*
- * A flag to force sched domain rebuild at the end of an operation while
- * inhibiting it in the intermediate stages when set. Currently it is only
- * set in hotplug code.
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ * - update_partition_sd_lb()
+ * - remote_partition_check()
+ * - update_cpumasks_hier()
+ * - cpuset_update_flag()
+ * - cpuset_hotplug_update_tasks()
+ * - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
*/
static bool force_sd_rebuild;
@@ -283,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
mutex_unlock(&cpuset_mutex);
}
+static inline bool cpuset_v2(void)
+{
+ return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
+ cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -293,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
*/
static inline bool is_in_v2_mode(void)
{
- return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ return cpuset_v2() ||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
@@ -565,12 +581,24 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
- * tasks.
+ * tasks. This check is not done when scheduling is disabled as the
+ * users should know what they are doing.
+ *
+ * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
+ * cpus_allowed.
+ *
+ * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
+ * for non-isolated partition root. At this point, the target
+ * effective_cpus isn't computed yet. user_xcpus() is the best
+ * approximation.
+ *
+ * TBD: May need to precompute the real effective_cpus here in case
+ * incorrect scheduling of SCHED_DEADLINE tasks in a partition
+ * becomes an issue.
*/
ret = -EBUSY;
- if (is_cpu_exclusive(cur) &&
- !cpuset_cpumask_can_shrink(cur->cpus_allowed,
- trial->cpus_allowed))
+ if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
+ !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
goto out;
/*
@@ -728,7 +756,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool cgrpv2 = cpuset_v2();
int nslot_update;
doms = NULL;
@@ -990,6 +1018,7 @@ void rebuild_sched_domains_locked(void)
lockdep_assert_cpus_held();
lockdep_assert_held(&cpuset_mutex);
+ force_sd_rebuild = false;
/*
* If we have raced with CPU hotplug, return early to avoid
@@ -1164,8 +1193,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
- if (rebuild_domains && !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (rebuild_domains)
+ cpuset_force_rebuild();
}
/*
@@ -1187,7 +1216,7 @@ static void reset_partition_data(struct cpuset *cs)
{
struct cpuset *parent = parent_cs(cs);
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (!cpuset_v2())
return;
lockdep_assert_held(&callback_lock);
@@ -1339,7 +1368,7 @@ static inline bool is_local_partition(struct cpuset *cs)
* remote_partition_enable - Enable current cpuset as a remote partition root
* @cs: the cpuset to update
* @new_prs: new partition_root_state
- * @tmp: temparary masks
+ * @tmp: temporary masks
* Return: 0 if successful, errcode if error
*
* Enable the current cpuset to become a remote partition root taking CPUs
@@ -1377,7 +1406,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1387,7 +1416,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
/*
* remote_partition_disable - Remove current cpuset from remote partition list
* @cs: the cpuset to update
- * @tmp: temparary masks
+ * @tmp: temporary masks
*
* The effective_cpus is also updated.
*
@@ -1413,7 +1442,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1423,7 +1452,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
* remote_cpus_update - cpus_exclusive change of remote partition
* @cs: the cpuset to be updated
* @newmask: the new effective_xcpus mask
- * @tmp: temparary masks
+ * @tmp: temporary masks
*
* top_cpuset and subpartitions_cpus will be updated or partition can be
* invalidated.
@@ -1465,7 +1494,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1480,7 +1509,7 @@ invalidate:
* @cs: the cpuset to be updated
* @newmask: the new effective_xcpus mask
* @delmask: temporary mask for deletion (not in tmp)
- * @tmp: temparary masks
+ * @tmp: temporary masks
*
* This should be called before the given cs has updated its cpus_allowed
* and/or effective_xcpus.
@@ -1512,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
remote_partition_disable(child, tmp);
disable_cnt++;
}
- if (disable_cnt && !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (disable_cnt)
+ cpuset_force_rebuild();
}
/*
@@ -1923,12 +1952,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
}
/*
- * update_cpumasks_hier() flags
- */
-#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
-#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
-
-/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1942,7 +1965,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
- int flags)
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -2007,12 +2030,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* Skip the whole subtree if
* 1) the cpumask remains the same,
* 2) has no partition root state,
- * 3) HIER_CHECKALL flag not set, and
+ * 3) force flag not set, and
* 4) for v2 load balance state same as its parent.
*/
- if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+ if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (!cpuset_v2() ||
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@@ -2086,8 +2109,7 @@ get_css:
* from parent if current cpuset isn't a valid partition root
* and their load balance states differ.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_partition_valid(cp) &&
+ if (cpuset_v2() && !is_partition_valid(cp) &&
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
if (is_sched_load_balance(parent))
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
@@ -2103,8 +2125,7 @@ get_css:
*/
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_valid(cp)))
+ (!cpuset_v2() || is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -2112,9 +2133,8 @@ get_css:
}
rcu_read_unlock();
- if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
- !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (need_rebuild_sched_domains)
+ cpuset_force_rebuild();
}
/**
@@ -2141,9 +2161,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* directly.
*
* The update_cpumasks_hier() function may sleep. So we have to
- * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
- * flag is used to suppress rebuild of sched domains as the callers
- * will take care of that.
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -2159,7 +2177,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
+ update_cpumasks_hier(sibling, tmp, false);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -2179,7 +2197,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks tmp;
struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
- int hier_flags = 0;
+ bool force = false;
int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -2206,7 +2224,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
return -EINVAL;
/*
- * When exclusive_cpus isn't explicitly set, it is constrainted
+ * When exclusive_cpus isn't explicitly set, it is constrained
* by cpus_allowed and parent's effective_xcpus. Otherwise,
* trialcs->effective_xcpus is used as a temporary cpumask
* for checking validity of the partition root.
@@ -2240,12 +2258,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Check all the descendants in update_cpumasks_hier() if
* effective_xcpus is to be changed.
*/
- if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
- hier_flags = HIER_CHECKALL;
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
- if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if ((retval == -EINVAL) && cpuset_v2()) {
struct cgroup_subsys_state *css;
struct cpuset *cp;
@@ -2309,7 +2326,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* effective_cpus/effective_xcpus will be updated here */
- update_cpumasks_hier(cs, &tmp, hier_flags);
+ update_cpumasks_hier(cs, &tmp, force);
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
@@ -2334,7 +2351,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks tmp;
struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
- int hier_flags = 0;
+ bool force = false;
int old_prs = cs->partition_root_state;
if (!*buf) {
@@ -2357,8 +2374,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Check all the descendants in update_cpumasks_hier() if
* effective_xcpus is to be changed.
*/
- if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
- hier_flags = HIER_CHECKALL;
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
if (retval)
@@ -2411,8 +2427,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* of the subtree when it is a valid partition root or effective_xcpus
* is updated.
*/
- if (is_partition_valid(cs) || hier_flags)
- update_cpumasks_hier(cs, &tmp, hier_flags);
+ if (is_partition_valid(cs) || force)
+ update_cpumasks_hier(cs, &tmp, force);
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
@@ -2737,9 +2753,12 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
- if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
- !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+ if (cpuset_v2())
+ cpuset_force_rebuild();
+ else
+ rebuild_sched_domains_locked();
+ }
if (spread_flag_changed)
cpuset1_update_tasks_flags(cs);
@@ -2853,12 +2872,14 @@ out:
update_unbound_workqueue_cpumask(new_xcpus_state);
/* Force update if switching back to member */
- update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
/* Update sched domains and load balance flag */
update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
free_cpumasks(NULL, &tmpmask);
return 0;
}
@@ -2919,8 +2940,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* migration permission derives from hierarchy ownership in
* cgroup_procs_write_permission()).
*/
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- (cpus_updated || mems_updated)) {
+ if (!cpuset_v2() || (cpus_updated || mems_updated)) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
@@ -3034,8 +3054,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* in effective cpus and mems. In that case, we can optimize out
* by skipping the task iteration and update.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !cpus_updated && !mems_updated) {
+ if (cpuset_v2() && !cpus_updated && !mems_updated) {
cpuset_attach_nodemask_to = cs->effective_mems;
goto out;
}
@@ -3152,6 +3171,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
out_unlock:
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
@@ -3383,7 +3404,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
INIT_LIST_HEAD(&cs->remote_sibling);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (cpuset_v2())
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
@@ -3410,8 +3431,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_sched_load_balance(parent))
+ if (cpuset_v2() && !is_sched_load_balance(parent))
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@@ -3481,8 +3501,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
if (is_partition_valid(cs))
update_prstate(cs, 0);
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- is_sched_load_balance(cs))
+ if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
@@ -3896,11 +3915,9 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
- /* rebuild sched domains if cpus_allowed has changed */
- if (force_sd_rebuild) {
- force_sd_rebuild = false;
+ /* rebuild sched domains if necessary */
+ if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- }
free_cpumasks(NULL, ptmp);
}
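The common thread in the cpuset hunks above is that intermediate update steps no longer call rebuild_sched_domains_locked() directly; they request a rebuild through cpuset_force_rebuild(), and a single rebuild runs at the end of the write path, with rebuild_sched_domains_locked() itself clearing the flag. Below is a self-contained userspace sketch of that deferral pattern; the two *_update_step() functions are placeholders, not names from the patch.

#include <stdbool.h>
#include <stdio.h>

static bool force_sd_rebuild;

static void cpuset_force_rebuild(void)
{
	force_sd_rebuild = true;
}

static void rebuild_sched_domains_locked(void)
{
	force_sd_rebuild = false;	/* the patch clears the flag here */
	puts("rebuilding sched domains once");
}

static void partition_update_step(void)
{
	cpuset_force_rebuild();
}

static void hierarchy_update_step(void)
{
	cpuset_force_rebuild();
}

int main(void)
{
	/* Several steps of one operation may request a rebuild ... */
	partition_update_step();
	hierarchy_update_step();

	/* ... but only one rebuild happens, at the end of the operation. */
	if (force_sd_rebuild)
		rebuild_sched_domains_locked();
	return 0;
}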
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 617861a54793..bf1690a167dd 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -9,6 +9,28 @@
#include <trace/events/cgroup.h>
/*
+ * Update the CGRP_FROZEN bit in cgrp->flags.
+ * Return true if the flag was changed; false if it already had the requested value.
+ */
+static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ /* Already there? */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen)
+ return false;
+
+ if (frozen)
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ else
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
+ return true;
+}
+
+/*
* Propagate the cgroup frozen state upwards by the cgroup tree.
*/
static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
@@ -24,24 +46,16 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
while ((cgrp = cgroup_parent(cgrp))) {
if (frozen) {
cgrp->freezer.nr_frozen_descendants += desc;
- if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
- test_bit(CGRP_FREEZE, &cgrp->flags) &&
- cgrp->freezer.nr_frozen_descendants ==
- cgrp->nr_descendants) {
- set_bit(CGRP_FROZEN, &cgrp->flags);
- cgroup_file_notify(&cgrp->events_file);
- TRACE_CGROUP_PATH(notify_frozen, cgrp, 1);
- desc++;
- }
+ if (!test_bit(CGRP_FREEZE, &cgrp->flags) ||
+ (cgrp->freezer.nr_frozen_descendants !=
+ cgrp->nr_descendants))
+ continue;
} else {
cgrp->freezer.nr_frozen_descendants -= desc;
- if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
- clear_bit(CGRP_FROZEN, &cgrp->flags);
- cgroup_file_notify(&cgrp->events_file);
- TRACE_CGROUP_PATH(notify_frozen, cgrp, 0);
- desc++;
- }
}
+
+ if (cgroup_update_frozen_flag(cgrp, frozen))
+ desc++;
}
}
@@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp)
{
bool frozen;
- lockdep_assert_held(&css_set_lock);
-
/*
* If the cgroup has to be frozen (CGRP_FREEZE bit set),
* and all tasks are frozen and/or stopped, let's consider
@@ -63,24 +75,9 @@ void cgroup_update_frozen(struct cgroup *cgrp)
frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
- if (frozen) {
- /* Already there? */
- if (test_bit(CGRP_FROZEN, &cgrp->flags))
- return;
-
- set_bit(CGRP_FROZEN, &cgrp->flags);
- } else {
- /* Already there? */
- if (!test_bit(CGRP_FROZEN, &cgrp->flags))
- return;
-
- clear_bit(CGRP_FROZEN, &cgrp->flags);
- }
- cgroup_file_notify(&cgrp->events_file);
- TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
-
- /* Update the state of ancestor cgroups. */
- cgroup_propagate_frozen(cgrp, frozen);
+ /* If flags is updated, update the state of ancestor cgroups. */
+ if (cgroup_update_frozen_flag(cgrp, frozen))
+ cgroup_propagate_frozen(cgrp, frozen);
}
/*
@@ -260,8 +257,10 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
void cgroup_freeze(struct cgroup *cgrp, bool freeze)
{
struct cgroup_subsys_state *css;
+ struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
+ bool old_e;
lockdep_assert_held(&cgroup_mutex);
@@ -282,22 +281,18 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
if (cgroup_is_dead(dsct))
continue;
- if (freeze) {
- dsct->freezer.e_freeze++;
- /*
- * Already frozen because of ancestor's settings?
- */
- if (dsct->freezer.e_freeze > 1)
- continue;
- } else {
- dsct->freezer.e_freeze--;
- /*
- * Still frozen because of ancestor's settings?
- */
- if (dsct->freezer.e_freeze > 0)
- continue;
-
- WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
+ /*
+ * e_freeze is affected by the parent's e_freeze and dsct's own freeze.
+ * If the old and new e_freeze values are equal, nothing changes and the
+ * children are not affected, so skip the whole subtree.
+ */
+ old_e = dsct->freezer.e_freeze;
+ parent = cgroup_parent(dsct);
+ dsct->freezer.e_freeze = (dsct->freezer.freeze ||
+ parent->freezer.e_freeze);
+ if (dsct->freezer.e_freeze == old_e) {
+ css = css_rightmost_descendant(css);
+ continue;
}
/*
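The new cgroup_update_frozen_flag() helper above folds together the "flip the bit only when it actually changes, notify, and report whether anything changed" logic that used to be duplicated in cgroup_update_frozen() and cgroup_propagate_frozen(). Here is a small userspace analogue of that pattern, with toy types standing in for the cgroup structures.

#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
	bool frozen;
};

static bool toy_update_frozen_flag(struct toy_cgroup *cgrp, bool frozen)
{
	if (cgrp->frozen == frozen)
		return false;				/* already in the desired state */

	cgrp->frozen = frozen;
	printf("notify: frozen=%d\n", frozen);		/* cgroup_file_notify() */
	return true;
}

int main(void)
{
	struct toy_cgroup cg = { .frozen = false };

	if (toy_update_frozen_flag(&cg, true))
		puts("propagate to ancestors");		/* only on a real change */
	toy_update_frozen_flag(&cg, true);		/* no-op: no notify, no propagation */
	return 0;
}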
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index a06b45272411..5877974ece92 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
+ dst_bstat->ntime += src_bstat->ntime;
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
+ dst_bstat->ntime -= src_bstat->ntime;
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
- case CPUTIME_USER:
case CPUTIME_NICE:
+ rstatc->bstat.ntime += delta_exec;
+ fallthrough;
+ case CPUTIME_USER:
rstatc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:
@@ -591,6 +595,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
+ bstat->ntime += cpustat[CPUTIME_NICE];
}
}
@@ -608,13 +613,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime;
+ u64 usage, utime, stime, ntime;
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+ ntime = cgrp->bstat.ntime;
cgroup_rstat_flush_release(cgrp);
} else {
/* cgrp->bstat of root is not actually used, reuse it */
@@ -622,16 +628,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
usage = cgrp->bstat.cputime.sum_exec_runtime;
utime = cgrp->bstat.cputime.utime;
stime = cgrp->bstat.cputime.stime;
+ ntime = cgrp->bstat.ntime;
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+ do_div(ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
- "user_usec %llu\n"
- "system_usec %llu\n",
- usage, utime, stime);
+ "user_usec %llu\n"
+ "system_usec %llu\n"
+ "nice_usec %llu\n",
+ usage, utime, stime, ntime);
cgroup_force_idle_show(seq, &cgrp->bstat);
}
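The rstat hunks above add a separate ntime counter and report it as nice_usec; because the CPUTIME_NICE case falls through to CPUTIME_USER, nice time stays a subset of user time. A minimal userspace illustration of that accounting follows (the enum, struct and sample values are made up for the example).

#include <stdint.h>
#include <stdio.h>

enum { CPUTIME_USER, CPUTIME_NICE };

struct toy_bstat {
	uint64_t utime;
	uint64_t ntime;
};

static void account(struct toy_bstat *b, int index, uint64_t delta)
{
	switch (index) {
	case CPUTIME_NICE:
		b->ntime += delta;
		/* fall through: nice time is also user time */
	case CPUTIME_USER:
		b->utime += delta;
		break;
	}
}

int main(void)
{
	struct toy_bstat b = { 0, 0 };

	account(&b, CPUTIME_USER, 3000);
	account(&b, CPUTIME_NICE, 1000);
	printf("user=%llu nice=%llu\n",
	       (unsigned long long)b.utime, (unsigned long long)b.ntime);
	/* prints: user=4000 nice=1000 */
	return 0;
}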
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 509ee703de15..20552f163930 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -103,6 +103,7 @@ CONFIG_BUG_ON_DATA_CORRUPTION=y
#
# RCU Debugging
#
+CONFIG_RCU_EXPERT=y
CONFIG_PROVE_RCU=y
CONFIG_PROVE_RCU_LIST=y
#
diff --git a/kernel/cred.c b/kernel/cred.c
index 075cfa7c896f..da7da250f7c8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -485,7 +485,7 @@ EXPORT_SYMBOL(abort_creds);
*/
const struct cred *override_creds(const struct cred *new)
{
- const struct cred *old = current->cred;
+ const struct cred *old;
kdebug("override_creds(%p{%ld})", new,
atomic_long_read(&new->usage));
@@ -499,7 +499,7 @@ const struct cred *override_creds(const struct cred *new)
* visible to other threads under RCU.
*/
get_new_cred((struct cred *)new);
- rcu_assign_pointer(current->cred, new);
+ old = override_creds_light(new);
kdebug("override_creds() = %p{%ld}", old,
atomic_long_read(&old->usage));
@@ -521,7 +521,7 @@ void revert_creds(const struct cred *old)
kdebug("revert_creds(%p{%ld})", old,
atomic_long_read(&old->usage));
- rcu_assign_pointer(current->cred, old);
+ revert_creds_light(old);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
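The cred.c hunks above replace the open-coded swap of current->cred with override_creds_light()/revert_creds_light(). A plausible shape for the light override helper, inferred only from the lines it replaces here; the real definition lives elsewhere in the tree and may differ in detail.

static inline const struct cred *override_creds_light(const struct cred *new)
{
	const struct cred *old = current->cred;

	rcu_assign_pointer(current->cred, new);
	return old;
}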
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 372025cf1ca3..c0c2072f5452 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -460,13 +460,15 @@ static int kdb_bc(int argc, const char **argv)
break;
case KDBCMD_BE:
+ if (bp->bp_enabled)
+ break;
+
bp->bp_enabled = 1;
kdb_printf("Breakpoint %d at "
- kdb_bfd_vma_fmt " enabled",
+ kdb_bfd_vma_fmt " enabled\n",
i, bp->bp_addr);
- kdb_printf("\n");
break;
case KDBCMD_BD:
if (!bp->bp_enabled)
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 3c2987f46f6e..3a74604fdb8a 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,8 @@
#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+#define CTRL(c) ((c) - 64)
+
static int kbd_exists;
static int kbd_last_ret;
@@ -123,24 +125,24 @@ int kdb_get_kbd_char(void)
return 8;
}
- /* Special Key */
+ /* Translate special keys to equivalent CTRL control characters */
switch (scancode) {
case 0xF: /* Tab */
- return 9;
+ return CTRL('I');
case 0x53: /* Del */
- return 4;
+ return CTRL('D');
case 0x47: /* Home */
- return 1;
+ return CTRL('A');
case 0x4F: /* End */
- return 5;
+ return CTRL('E');
case 0x4B: /* Left */
- return 2;
+ return CTRL('B');
case 0x48: /* Up */
- return 16;
+ return CTRL('P');
case 0x50: /* Down */
- return 14;
+ return CTRL('N');
case 0x4D: /* Right */
- return 6;
+ return CTRL('F');
}
if (scancode == 0xe0)
@@ -172,6 +174,19 @@ int kdb_get_kbd_char(void)
switch (KTYP(keychar)) {
case KT_LETTER:
case KT_LATIN:
+ switch (keychar) {
+ /* non-printable supported control characters */
+ case CTRL('A'): /* Home */
+ case CTRL('B'): /* Left */
+ case CTRL('D'): /* Del */
+ case CTRL('E'): /* End */
+ case CTRL('F'): /* Right */
+ case CTRL('I'): /* Tab */
+ case CTRL('N'): /* Down */
+ case CTRL('P'): /* Up */
+ return keychar;
+ }
+
if (isprint(keychar))
break; /* printable characters */
fallthrough;
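The CTRL() macro above maps a letter to its control character (the letter's code minus 64), so each translated special key keeps exactly the numeric value the old magic constant had. A quick userspace check of that equivalence:

#include <stdio.h>

#define CTRL(c) ((c) - 64)

int main(void)
{
	printf("Home -> %d\n", CTRL('A'));	/* 1, was 'return 1'  */
	printf("Left -> %d\n", CTRL('B'));	/* 2, was 'return 2'  */
	printf("Del  -> %d\n", CTRL('D'));	/* 4, was 'return 4'  */
	printf("Up   -> %d\n", CTRL('P'));	/* 16, was 'return 16' */
	return 0;
}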
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index f5f7d7fb5936..5f4be507d79f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -306,8 +306,8 @@ static int kdbgetulenv(const char *match, unsigned long *value)
return KDB_NOTENV;
if (strlen(ep) == 0)
return KDB_NOENVVALUE;
-
- *value = simple_strtoul(ep, NULL, 0);
+ if (kstrtoul(ep, 0, value))
+ return KDB_BADINT;
return 0;
}
@@ -402,42 +402,15 @@ static void kdb_printenv(void)
*/
int kdbgetularg(const char *arg, unsigned long *value)
{
- char *endp;
- unsigned long val;
-
- val = simple_strtoul(arg, &endp, 0);
-
- if (endp == arg) {
- /*
- * Also try base 16, for us folks too lazy to type the
- * leading 0x...
- */
- val = simple_strtoul(arg, &endp, 16);
- if (endp == arg)
- return KDB_BADINT;
- }
-
- *value = val;
-
+ if (kstrtoul(arg, 0, value))
+ return KDB_BADINT;
return 0;
}
int kdbgetu64arg(const char *arg, u64 *value)
{
- char *endp;
- u64 val;
-
- val = simple_strtoull(arg, &endp, 0);
-
- if (endp == arg) {
-
- val = simple_strtoull(arg, &endp, 16);
- if (endp == arg)
- return KDB_BADINT;
- }
-
- *value = val;
-
+ if (kstrtou64(arg, 0, value))
+ return KDB_BADINT;
return 0;
}
@@ -473,10 +446,10 @@ int kdb_set(int argc, const char **argv)
*/
if (strcmp(argv[1], "KDBDEBUG") == 0) {
unsigned int debugflags;
- char *cp;
+ int ret;
- debugflags = simple_strtoul(argv[2], &cp, 0);
- if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ ret = kstrtouint(argv[2], 0, &debugflags);
+ if (ret || debugflags & ~KDB_DEBUG_FLAG_MASK) {
kdb_printf("kdb: illegal debug flags '%s'\n",
argv[2]);
return 0;
@@ -1619,10 +1592,10 @@ static int kdb_md(int argc, const char **argv)
if (!argv[0][3])
valid = 1;
else if (argv[0][3] == 'c' && argv[0][4]) {
- char *p;
- repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ if (kstrtouint(argv[0] + 4, 10, &repeat))
+ return KDB_BADINT;
mdcount = ((repeat * bytesperword) + 15) / 16;
- valid = !*p;
+ valid = 1;
}
last_repeat = repeat;
} else if (strcmp(argv[0], "md") == 0)
@@ -2083,15 +2056,10 @@ static int kdb_dmesg(int argc, const char **argv)
if (argc > 2)
return KDB_ARGCOUNT;
if (argc) {
- char *cp;
- lines = simple_strtol(argv[1], &cp, 0);
- if (*cp)
+ if (kstrtoint(argv[1], 0, &lines))
lines = 0;
- if (argc > 1) {
- adjust = simple_strtoul(argv[2], &cp, 0);
- if (*cp || adjust < 0)
- adjust = 0;
- }
+ if (argc > 1 && (kstrtoint(argv[2], 0, &adjust) || adjust < 0))
+ adjust = 0;
}
/* disable LOGGING if set */
@@ -2428,14 +2396,12 @@ static int kdb_help(int argc, const char **argv)
static int kdb_kill(int argc, const char **argv)
{
long sig, pid;
- char *endp;
struct task_struct *p;
if (argc != 2)
return KDB_ARGCOUNT;
- sig = simple_strtol(argv[1], &endp, 0);
- if (*endp)
+ if (kstrtol(argv[1], 0, &sig))
return KDB_BADINT;
if ((sig >= 0) || !valid_signal(-sig)) {
kdb_printf("Invalid signal parameter.<-signal>\n");
@@ -2443,8 +2409,7 @@ static int kdb_kill(int argc, const char **argv)
}
sig = -sig;
- pid = simple_strtol(argv[2], &endp, 0);
- if (*endp)
+ if (kstrtol(argv[2], 0, &pid))
return KDB_BADINT;
if (pid <= 0) {
kdb_printf("Process ID must be large than 0.\n");
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 4c0dcd909121..31cfdb6b4bc3 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -260,23 +260,6 @@ config DMA_API_DEBUG
If unsure, say N.
-config DMA_API_DEBUG_SG
- bool "Debug DMA scatter-gather usage"
- default y
- depends on DMA_API_DEBUG
- help
- Perform extra checking that callers of dma_map_sg() have respected the
- appropriate segment length/boundary limits for the given device when
- preparing DMA scatterlists.
-
- This is particularly likely to have been overlooked in cases where the
- dma_map_sg() API is used for general bulk mapping of pages rather than
- preparing literal scatter-gather descriptors, where there is a risk of
- unexpected behaviour from DMA API implementations if the scatterlist
- is technically out-of-spec.
-
- If unsure, say N.
-
config DMA_MAP_BENCHMARK
bool "Enable benchmarking of streaming DMA mapping"
depends on DEBUG_FS
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index ff5683a57f77..3b2bdca9f1d4 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -330,7 +330,8 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
#include <linux/of_reserved_mem.h>
#ifdef CONFIG_DMA_GLOBAL_POOL
-static struct reserved_mem *dma_reserved_default_memory __initdata;
+static phys_addr_t dma_reserved_default_memory_base __initdata;
+static phys_addr_t dma_reserved_default_memory_size __initdata;
#endif
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
@@ -376,9 +377,10 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
#ifdef CONFIG_DMA_GLOBAL_POOL
if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
- WARN(dma_reserved_default_memory,
+ WARN(dma_reserved_default_memory_size,
"Reserved memory: region for default DMA coherent area is redefined\n");
- dma_reserved_default_memory = rmem;
+ dma_reserved_default_memory_base = rmem->base;
+ dma_reserved_default_memory_size = rmem->size;
}
#endif
@@ -391,10 +393,10 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
#ifdef CONFIG_DMA_GLOBAL_POOL
static int __init dma_init_reserved_memory(void)
{
- if (!dma_reserved_default_memory)
+ if (!dma_reserved_default_memory_size)
return -ENOMEM;
- return dma_init_global_coherent(dma_reserved_default_memory->base,
- dma_reserved_default_memory->size);
+ return dma_init_global_coherent(dma_reserved_default_memory_base,
+ dma_reserved_default_memory_size);
}
core_initcall(dma_init_reserved_memory);
#endif /* CONFIG_DMA_GLOBAL_POOL */
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index d570535342cb..295396226f31 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -59,8 +59,7 @@ enum map_err_types {
* @direction: enum dma_data_direction
* @sg_call_ents: 'nents' from dma_map_sg
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
- * @pfn: page frame of the start address
- * @offset: offset of mapping relative to pfn
+ * @paddr: physical start address of the mapping
* @map_err_type: track whether dma_mapping_error() was checked
* @stack_len: number of backtrace entries in @stack_entries
* @stack_entries: stack of backtrace history
@@ -74,8 +73,7 @@ struct dma_debug_entry {
int direction;
int sg_call_ents;
int sg_mapped_ents;
- unsigned long pfn;
- size_t offset;
+ phys_addr_t paddr;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
@@ -389,14 +387,6 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
list_del(&entry->list);
}
-static unsigned long long phys_addr(struct dma_debug_entry *entry)
-{
- if (entry->type == dma_debug_resource)
- return __pfn_to_phys(entry->pfn) + entry->offset;
-
- return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
-}
-
/*
* For each mapping (initial cacheline in the case of
* dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
@@ -428,8 +418,8 @@ static DEFINE_SPINLOCK(radix_lock);
static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry)
{
- return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) +
- (entry->offset >> L1_CACHE_SHIFT);
+ return ((entry->paddr >> PAGE_SHIFT) << CACHELINE_PER_PAGE_SHIFT) +
+ (offset_in_page(entry->paddr) >> L1_CACHE_SHIFT);
}
static int active_cacheline_read_overlap(phys_addr_t cln)
@@ -538,11 +528,11 @@ void debug_dma_dump_mappings(struct device *dev)
if (!dev || dev == entry->dev) {
cln = to_cacheline_number(entry);
dev_info(entry->dev,
- "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ "%s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- &cln, dir2name[entry->direction],
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
maperr2str[entry->map_err_type]);
}
}
@@ -569,13 +559,13 @@ static int dump_show(struct seq_file *seq, void *v)
list_for_each_entry(entry, &bucket->list, list) {
cln = to_cacheline_number(entry);
seq_printf(seq,
- "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ "%s %s %s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
dev_driver_string(entry->dev),
dev_name(entry->dev),
type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- &cln, dir2name[entry->direction],
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
maperr2str[entry->map_err_type]);
}
spin_unlock_irqrestore(&bucket->lock, flags);
@@ -1003,16 +993,16 @@ static void check_unmap(struct dma_debug_entry *ref)
"[mapped as %s] [unmapped as %s]\n",
ref->dev_addr, ref->size,
type2name[entry->type], type2name[ref->type]);
- } else if ((entry->type == dma_debug_coherent) &&
- (phys_addr(ref) != phys_addr(entry))) {
+ } else if (entry->type == dma_debug_coherent &&
+ ref->paddr != entry->paddr) {
err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
"[device address=0x%016llx] [size=%llu bytes] "
- "[cpu alloc address=0x%016llx] "
- "[cpu free address=0x%016llx]",
+ "[cpu alloc address=0x%pa] "
+ "[cpu free address=0x%pa]",
ref->dev_addr, ref->size,
- phys_addr(entry),
- phys_addr(ref));
+ &entry->paddr,
+ &ref->paddr);
}
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
@@ -1052,9 +1042,13 @@ static void check_unmap(struct dma_debug_entry *ref)
}
hash_bucket_del(entry);
- dma_entry_free(entry);
-
put_hash_bucket(bucket, flags);
+
+ /*
+ * Free the entry outside of bucket_lock to avoid ABBA deadlocks
+ * between that and radix_lock.
+ */
+ dma_entry_free(entry);
}
static void check_for_stack(struct device *dev,
@@ -1169,7 +1163,6 @@ out:
static void check_sg_segment(struct device *dev, struct scatterlist *sg)
{
-#ifdef CONFIG_DMA_API_DEBUG_SG
unsigned int max_seg = dma_get_max_seg_size(dev);
u64 start, end, boundary = dma_get_seg_boundary(dev);
@@ -1190,7 +1183,6 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
if ((start ^ end) & ~boundary)
err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
start, end, boundary);
-#endif
}
void debug_dma_map_single(struct device *dev, const void *addr,
@@ -1227,8 +1219,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
entry->dev = dev;
entry->type = dma_debug_single;
- entry->pfn = page_to_pfn(page);
- entry->offset = offset;
+ entry->paddr = page_to_phys(page);
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
@@ -1323,8 +1314,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
- entry->pfn = page_to_pfn(sg_page(s));
- entry->offset = s->offset;
+ entry->paddr = sg_phys(s);
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
entry->direction = direction;
@@ -1370,8 +1360,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = dir,
@@ -1410,16 +1399,12 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
entry->type = dma_debug_coherent;
entry->dev = dev;
- entry->offset = offset_in_page(virt);
+ entry->paddr = page_to_phys((is_vmalloc_addr(virt) ?
+ vmalloc_to_page(virt) : virt_to_page(virt)));
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = DMA_BIDIRECTIONAL;
- if (is_vmalloc_addr(virt))
- entry->pfn = vmalloc_to_pfn(virt);
- else
- entry->pfn = page_to_pfn(virt_to_page(virt));
-
add_dma_entry(entry, attrs);
}
@@ -1429,7 +1414,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
struct dma_debug_entry ref = {
.type = dma_debug_coherent,
.dev = dev,
- .offset = offset_in_page(virt),
.dev_addr = dma_addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
@@ -1439,10 +1423,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
return;
- if (is_vmalloc_addr(virt))
- ref.pfn = vmalloc_to_pfn(virt);
- else
- ref.pfn = page_to_pfn(virt_to_page(virt));
+ ref.paddr = page_to_phys((is_vmalloc_addr(virt) ?
+ vmalloc_to_page(virt) : virt_to_page(virt)));
if (unlikely(dma_debug_disabled()))
return;
@@ -1465,8 +1447,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->type = dma_debug_resource;
entry->dev = dev;
- entry->pfn = PHYS_PFN(addr);
- entry->offset = offset_in_page(addr);
+ entry->paddr = addr;
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = direction;
@@ -1543,8 +1524,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
@@ -1575,8 +1555,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
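With each entry now carrying a single paddr, to_cacheline_number() splits it back into a page part and an in-page part, as shown in the hunk near the top of this file. Here is a standalone sketch of that arithmetic with example shift values (4 KiB pages, 64-byte cache lines); in the kernel these constants come from PAGE_SHIFT and L1_CACHE_SHIFT.

#include <stdio.h>

#define PAGE_SHIFT		12
#define L1_CACHE_SHIFT		6
#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)

static unsigned long long to_cacheline_number(unsigned long long paddr)
{
	unsigned long long offset = paddr & ((1ULL << PAGE_SHIFT) - 1);

	return ((paddr >> PAGE_SHIFT) << CACHELINE_PER_PAGE_SHIFT) +
	       (offset >> L1_CACHE_SHIFT);
}

int main(void)
{
	/* Same physical address, expressed directly instead of as pfn + offset. */
	printf("cln = %llu\n", to_cacheline_number(5ULL * 0x1000 + 0x80));
	/* (5 << 6) + (0x80 >> 6) = 320 + 2 = 322 */
	return 0;
}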
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 864a1121bf08..cda127027e48 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -223,6 +223,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
} else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
ents != -EIO && ents != -EREMOTEIO)) {
+ trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs);
return -EIO;
}
@@ -604,22 +605,29 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
if (WARN_ON_ONCE(flag & __GFP_COMP))
return NULL;
- if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+ if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) {
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size,
+ DMA_BIDIRECTIONAL, flag, attrs);
return cpu_addr;
+ }
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_alloc_direct(dev, ops))
+ if (dma_alloc_direct(dev, ops)) {
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
- else if (use_dma_iommu(dev))
+ } else if (use_dma_iommu(dev)) {
cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs);
- else if (ops->alloc)
+ } else if (ops->alloc) {
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
- else
+ } else {
+ trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag,
+ attrs);
return NULL;
+ }
- trace_dma_alloc(dev, cpu_addr, *dma_handle, size, flag, attrs);
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL,
+ flag, attrs);
debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
@@ -641,10 +649,11 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
*/
WARN_ON(irqs_disabled());
+ trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL,
+ attrs);
if (!cpu_addr)
return;
- trace_dma_free(dev, cpu_addr, dma_handle, size, attrs);
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
@@ -683,9 +692,11 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
if (page) {
- trace_dma_map_page(dev, page_to_phys(page), *dma_handle, size,
- dir, 0);
+ trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle,
+ size, dir, gfp, 0);
debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ } else {
+ trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0);
}
return page;
}
@@ -708,7 +719,7 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
void dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
- trace_dma_unmap_page(dev, dma_handle, size, dir, 0);
+ trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0);
debug_dma_unmap_page(dev, dma_handle, size, dir);
__dma_free_pages(dev, size, page, dma_handle, dir);
}
@@ -768,8 +779,10 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (sgt) {
sgt->nents = 1;
- trace_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs);
debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ } else {
+ trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs);
}
return sgt;
}
@@ -787,7 +800,7 @@ static void free_single_sgt(struct device *dev, size_t size,
void dma_free_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt, enum dma_data_direction dir)
{
- trace_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir, 0);
+ trace_dma_free_sgt(dev, sgt, size, dir);
debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
if (use_dma_iommu(dev))
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index da59c68df841..b027a4030976 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -95,10 +95,6 @@ struct kprobe_insn_page {
char slot_used[];
};
-#define KPROBE_INSN_PAGE_SIZE(slots) \
- (offsetof(struct kprobe_insn_page, slot_used) + \
- (sizeof(char) * (slots)))
-
static int slots_per_page(struct kprobe_insn_cache *c)
{
return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -175,7 +171,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
goto retry;
/* All out of space. Need to allocate a new page. */
- kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
+ kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL);
if (!kip)
goto out;
@@ -206,29 +202,29 @@ static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
- if (kip->nused == 0) {
+ if (kip->nused != 0)
+ return false;
+
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ if (!list_is_singular(&kip->list)) {
/*
- * Page is no longer in use. Free it unless
- * it's the last one. We keep the last one
- * so as not to have to set it up again the
- * next time somebody inserts a probe.
+ * Record perf ksymbol unregister event before removing
+ * the page.
*/
- if (!list_is_singular(&kip->list)) {
- /*
- * Record perf ksymbol unregister event before removing
- * the page.
- */
- perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
- (unsigned long)kip->insns, PAGE_SIZE, true,
- kip->cache->sym);
- list_del_rcu(&kip->list);
- synchronize_rcu();
- kip->cache->free(kip->insns);
- kfree(kip);
- }
- return true;
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
+ list_del_rcu(&kip->list);
+ synchronize_rcu();
+ kip->cache->free(kip->insns);
+ kfree(kip);
}
- return false;
+ return true;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
@@ -353,8 +349,8 @@ struct kprobe_insn_cache kprobe_optinsn_slots = {
/* .insn_size is initialized later */
.nr_garbage = 0,
};
-#endif
-#endif
+#endif /* CONFIG_OPTPROBES */
+#endif /* __ARCH_WANT_KPROBES_INSN_SLOT */
/* We have preemption disabled.. so it is safe to use __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
@@ -1543,7 +1539,7 @@ static int check_ftrace_location(struct kprobe *p)
if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
p->flags |= KPROBE_FLAG_FTRACE;
-#else /* !CONFIG_KPROBES_ON_FTRACE */
+#else
return -EINVAL;
#endif
}
@@ -1725,28 +1721,29 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
if (unlikely(orig_p == NULL))
return ERR_PTR(-EINVAL);
- if (!kprobe_disabled(p)) {
- /* Disable probe if it is a child probe */
- if (p != orig_p)
- p->flags |= KPROBE_FLAG_DISABLED;
+ if (kprobe_disabled(p))
+ return orig_p;
- /* Try to disarm and disable this/parent probe */
- if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- /*
- * Don't be lazy here. Even if 'kprobes_all_disarmed'
- * is false, 'orig_p' might not have been armed yet.
- * Note arm_all_kprobes() __tries__ to arm all kprobes
- * on the best effort basis.
- */
- if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
- ret = disarm_kprobe(orig_p, true);
- if (ret) {
- p->flags &= ~KPROBE_FLAG_DISABLED;
- return ERR_PTR(ret);
- }
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ /*
+ * Don't be lazy here. Even if 'kprobes_all_disarmed'
+ * is false, 'orig_p' might not have been armed yet.
+ * Note arm_all_kprobes() __tries__ to arm all kprobes
+ * on the best effort basis.
+ */
+ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
+ ret = disarm_kprobe(orig_p, true);
+ if (ret) {
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ return ERR_PTR(ret);
}
- orig_p->flags |= KPROBE_FLAG_DISABLED;
}
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
}
return orig_p;
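The kprobes change above drops the hand-rolled KPROBE_INSN_PAGE_SIZE() macro in favour of struct_size(kip, slot_used, slots), which computes the size of a structure ending in a flexible array member and, unlike the open-coded form, guards the multiplication against overflow. The userspace sketch below shows only the equivalent offsetof-based arithmetic; struct_size() itself is a kernel macro from overflow.h.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct insn_page {
	void *insns;
	int nused;
	char slot_used[];		/* flexible array member */
};

int main(void)
{
	size_t slots = 32;
	size_t sz = offsetof(struct insn_page, slot_used) + sizeof(char) * slots;
	struct insn_page *kip = malloc(sz);

	if (!kip)
		return 1;
	printf("allocated %zu bytes for %zu slots\n", sz, slots);
	free(kip);
	return 0;
}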
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 3fcb48502adb..c6bb47666aef 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -53,6 +53,8 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
/* Flags for a single printk record. */
enum printk_info_flags {
+ /* always show on console, ignore console_loglevel */
+ LOG_FORCE_CON = 1,
LOG_NEWLINE = 2, /* text ended with a newline */
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
@@ -90,6 +92,7 @@ bool printk_percpu_data_ready(void);
void defer_console_output(void);
bool is_printk_legacy_deferred(void);
+bool is_printk_force_console(void);
u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 19911c8fa7b6..80910bc3470c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1157,6 +1157,17 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
+static void print_log_buf_usage_stats(void)
+{
+ unsigned int descs_count = log_buf_len >> PRB_AVGBITS;
+ size_t meta_data_size;
+
+ meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info));
+
+ pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n",
+ log_buf_len, meta_data_size, log_buf_len + meta_data_size);
+}
+
void __init setup_log_buf(int early)
{
struct printk_info *new_infos;
@@ -1186,20 +1197,25 @@ void __init setup_log_buf(int early)
if (!early && !new_log_buf_len)
log_buf_add_cpu();
- if (!new_log_buf_len)
+ if (!new_log_buf_len) {
+ /* Show the memory stats only once. */
+ if (!early)
+ goto out;
+
return;
+ }
new_descs_count = new_log_buf_len >> PRB_AVGBITS;
if (new_descs_count == 0) {
pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
- return;
+ goto out;
}
new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
pr_err("log_buf_len: %lu text bytes not available\n",
new_log_buf_len);
- return;
+ goto out;
}
new_descs_size = new_descs_count * sizeof(struct prb_desc);
@@ -1262,7 +1278,7 @@ void __init setup_log_buf(int early)
prb_next_seq(&printk_rb_static) - seq);
}
- pr_info("log_buf_len: %u bytes\n", log_buf_len);
+ print_log_buf_usage_stats();
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
return;
@@ -1271,6 +1287,8 @@ err_free_descs:
memblock_free(new_descs, new_descs_size);
err_free_log_buf:
memblock_free(new_log_buf, new_log_buf_len);
+out:
+ print_log_buf_usage_stats();
}
static bool __read_mostly ignore_loglevel;
@@ -1320,11 +1338,11 @@ static void boot_delay_msec(int level)
{
unsigned long long k;
unsigned long timeout;
+ bool suppress = !is_printk_force_console() &&
+ suppress_message_printing(level);
- if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
- || suppress_message_printing(level)) {
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress)
return;
- }
k = (unsigned long long)loops_per_msec * boot_delay;
@@ -2274,6 +2292,9 @@ int vprintk_store(int facility, int level,
if (dev_info)
flags |= LOG_NEWLINE;
+ if (is_printk_force_console())
+ flags |= LOG_FORCE_CON;
+
if (flags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
@@ -2281,6 +2302,9 @@ int vprintk_store(int facility, int level,
facility, &flags, fmt, args);
r.info->text_len += text_len;
+ if (flags & LOG_FORCE_CON)
+ r.info->flags |= LOG_FORCE_CON;
+
if (flags & LOG_NEWLINE) {
r.info->flags |= LOG_NEWLINE;
prb_final_commit(&e);
@@ -2948,6 +2972,7 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
struct printk_info info;
struct printk_record r;
size_t len = 0;
+ bool force_con;
/*
* Formatting extended messages requires a separate buffer, so use the
@@ -2966,9 +2991,13 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
pmsg->seq = r.info->seq;
pmsg->dropped = r.info->seq - seq;
+ force_con = r.info->flags & LOG_FORCE_CON;
- /* Skip record that has level above the console loglevel. */
- if (may_suppress && suppress_message_printing(r.info->level))
+ /*
+ * Skip records that are not forced to be printed on consoles and that
+ * has level above the console loglevel.
+ */
+ if (!force_con && may_suppress && suppress_message_printing(r.info->level))
goto out;
if (is_extended) {
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 2b35a9d3919d..6f94418d53ff 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -12,6 +12,24 @@
#include "internal.h"
+/* Context where printk messages are never suppressed */
+static atomic_t force_con;
+
+void printk_force_console_enter(void)
+{
+ atomic_inc(&force_con);
+}
+
+void printk_force_console_exit(void)
+{
+ atomic_dec(&force_con);
+}
+
+bool is_printk_force_console(void)
+{
+ return atomic_read(&force_con);
+}
+
static DEFINE_PER_CPU(int, printk_context);
/* Can be preempted by NMI. */
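The force_con counter above defines a context in which new records are tagged LOG_FORCE_CON and are never filtered by console_loglevel. A hedged usage sketch follows; the calling function and message are illustrative and not part of this series.

void report_fatal_condition(void)
{
	printk_force_console_enter();
	pr_info("state dump follows; this output must reach the console\n");
	/* ... further pr_*() calls that must not be suppressed ... */
	printk_force_console_exit();
}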
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ecb88c528544..7fff1d045477 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -199,8 +199,10 @@ struct scx_dump_ctx {
/**
* struct sched_ext_ops - Operation table for BPF scheduler implementation
*
- * Userland can implement an arbitrary scheduling policy by implementing and
- * loading operations in this table.
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
*/
struct sched_ext_ops {
/**
@@ -218,10 +220,15 @@ struct sched_ext_ops {
* dispatch. While an explicit custom mechanism can be added,
* select_cpu() serves as the default way to wake up idle CPUs.
*
- * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
- * is dispatched, the ops.enqueue() callback will be skipped. Finally,
- * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
- * local DSQ of whatever CPU is returned by this callback.
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
*/
s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
@@ -230,12 +237,12 @@ struct sched_ext_ops {
* @p: task being enqueued
* @enq_flags: %SCX_ENQ_*
*
- * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
- * or enqueue on the BPF scheduler. If not directly dispatched, the bpf
- * scheduler owns @p and if it fails to dispatch @p, the task will
- * stall.
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
*
- * If @p was dispatched from ops.select_cpu(), this callback is
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
* skipped.
*/
void (*enqueue)(struct task_struct *p, u64 enq_flags);
@@ -257,17 +264,17 @@ struct sched_ext_ops {
void (*dequeue)(struct task_struct *p, u64 deq_flags);
/**
- * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
+ * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
* @cpu: CPU to dispatch tasks for
* @prev: previous task being switched out
*
* Called when a CPU's local dsq is empty. The operation should dispatch
* one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
- * scx_bpf_consume().
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
*
- * The maximum number of times scx_bpf_dispatch() can be called without
- * an intervening scx_bpf_consume() is specified by
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
* ops.dispatch_max_batch. See the comments on top of the two functions
* for more details.
*
@@ -275,7 +282,7 @@ struct sched_ext_ops {
* @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
* @prev->scx.flags, it is not enqueued yet and will be enqueued after
* ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
*/
void (*dispatch)(s32 cpu, struct task_struct *prev);
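
And the matching ops.dispatch() sketch, refilling the local DSQ from the same hypothetical shared DSQ with the new move kfunc:

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* same hypothetical user DSQ as above */

void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
{
	/* pull one task from SHARED_DSQ into this CPU's local DSQ, if any */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
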
@@ -594,7 +601,7 @@ struct sched_ext_ops {
* Update @tg's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-#endif /* CONFIG_CGROUPS */
+#endif /* CONFIG_EXT_GROUP_SCHED */
/*
* All online ops must come before ops.cpu_online().
@@ -707,7 +714,7 @@ enum scx_enq_flags {
/*
* Set the following to trigger preemption when calling
- * scx_bpf_dispatch() with a local dsq as the target. The slice of the
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
* current task is cleared to zero and the CPU is kicked into the
* scheduling path. Implies %SCX_ENQ_HEAD.
*/
@@ -862,8 +869,9 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static unsigned long scx_in_softlockup;
+static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth;
-static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -876,6 +884,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+#ifdef CONFIG_SMP
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -2309,7 +2322,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
/*
* We don't require the BPF scheduler to avoid dispatching to offline
* CPUs mostly for convenience but also because CPUs can go offline
- * between scx_bpf_dispatch() calls and here. Trigger error iff the
+ * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
* picked CPU is outside the allowed mask.
*/
if (!task_allowed_on_cpu(p, cpu)) {
@@ -2397,11 +2410,115 @@ static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *r
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
+/**
+ * move_task_between_dsqs() - Move a task from one DSQ to another
+ * @p: target task
+ * @enq_flags: %SCX_ENQ_*
+ * @src_dsq: DSQ @p is currently on, must not be a local DSQ
+ * @dst_dsq: DSQ @p is being moved to, can be any DSQ
+ *
+ * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
+ * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
+ * will change. As @p's task_rq is locked, this function doesn't need to use the
+ * holding_cpu mechanism.
+ *
+ * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
+ * return value, is locked.
+ */
+static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct scx_dispatch_q *dst_dsq)
+{
+ struct rq *src_rq = task_rq(p), *dst_rq;
+
+ BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(src_rq);
+
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+ if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ dst_dsq = find_global_dsq(p);
+ dst_rq = src_rq;
+ }
+ } else {
+ /* no need to migrate if destination is a non-local DSQ */
+ dst_rq = src_rq;
+ }
+
+ /*
+ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
+ * CPU, @p will be migrated.
+ */
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ /* @p is going from a non-local DSQ to a local DSQ */
+ if (src_rq == dst_rq) {
+ task_unlink_from_dsq(p, src_dsq);
+ move_local_task_to_local_dsq(p, enq_flags,
+ src_dsq, dst_rq);
+ raw_spin_unlock(&src_dsq->lock);
+ } else {
+ raw_spin_unlock(&src_dsq->lock);
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ }
+ } else {
+ /*
+ * @p is going from a non-local DSQ to a non-local DSQ. As
+ * $src_dsq is already locked, do an abbreviated dequeue.
+ */
+ task_unlink_from_dsq(p, src_dsq);
+ p->scx.dsq = NULL;
+ raw_spin_unlock(&src_dsq->lock);
+
+ dispatch_enqueue(dst_dsq, p, enq_flags);
+ }
+
+ return dst_rq;
+}
+
+/*
+ * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
+ * banging on the same DSQ on a large NUMA system to the point where switching
+ * to the bypass mode can take a long time. Inject artificial delays while the
+ * bypass mode is switching to guarantee timely completion.
+ */
+static void scx_ops_breather(struct rq *rq)
+{
+ u64 until;
+
+ lockdep_assert_rq_held(rq);
+
+ if (likely(!atomic_read(&scx_ops_breather_depth)))
+ return;
+
+ raw_spin_rq_unlock(rq);
+
+ until = ktime_get_ns() + NSEC_PER_MSEC;
+
+ do {
+ int cnt = 1024;
+ while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ cpu_relax();
+ } while (atomic_read(&scx_ops_breather_depth) &&
+ time_before64(ktime_get_ns(), until));
+
+ raw_spin_rq_lock(rq);
+}
+
static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
+ * This retry loop can repeatedly race against scx_ops_bypass()
+ * dequeueing tasks from @dsq trying to put the system into the bypass
+ * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
+ * live-lock the machine into soft lockups. Give a breather.
+ */
+ scx_ops_breather(rq);
+
+ /*
* The caller can't expect to successfully consume a task if the task's
* addition to @dsq isn't guaranteed to be visible somehow. Test
* @dsq->list without locking and skip if it seems empty.
@@ -2541,7 +2658,7 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* Dispatching to local DSQs may need to wait for queueing to complete or
* require rq lock dancing. As we don't wanna do either while inside
* ops.dispatch() to avoid locking order inversion, we split dispatching into
- * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
+ * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
* task and its qseq. Once ops.dispatch() returns, this function is called to
* finish up.
*
@@ -2573,7 +2690,7 @@ retry:
/*
* If qseq doesn't match, @p has gone through at least one
* dispatch/dequeue and re-enqueue cycle between
- * scx_bpf_dispatch() and here and we have no claim on it.
+ * scx_bpf_dsq_insert() and here and we have no claim on it.
*/
if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
return;
@@ -2642,7 +2759,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* If the previous sched_class for the current CPU was not SCX,
* notify the BPF scheduler that it again has control of the
* core. This callback complements ->cpu_release(), which is
- * emitted in scx_next_task_picked().
+ * emitted in switch_class().
*/
if (SCX_HAS_OP(cpu_acquire))
SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
@@ -3098,28 +3215,216 @@ found:
goto retry;
}
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+ * In this case, if we only check the first online CPU (cpu0), we might
+	 * incorrectly conclude that the LLC and NUMA domains fully overlap,
+	 * while NUMA 1 in fact contains two distinct LLC domains.
+ */
+ for_each_online_cpu(cpu) {
+ const struct cpumask *numa_cpus;
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return true;
+
+ numa_cpus = cpumask_of_node(cpu_to_node(cpu));
+ if (sd->span_weight != cpumask_weight(numa_cpus))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+static void update_selcpu_topology(void)
+{
+ bool enable_llc = false, enable_numa = false;
+ struct sched_domain *sd;
+ const struct cpumask *cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (sd) {
+ if (sd->span_weight < num_online_cpus())
+ enable_llc = true;
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+	 * among the online CPUs and the NUMA domains don't perfectly overlap
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ */
+ cpus = cpumask_of_node(cpu_to_node(cpu));
+ if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch())
+ enable_numa = true;
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ enable_llc ? "enabled" : "disabled");
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ enable_numa ? "enabled" : "disabled");
+
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *found)
{
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
s32 cpu;
*found = false;
+
+ /*
+	 * The RCU read-side critical section protects the sd_llc lookup
+	 * that llc_cpus is derived from.
+ */
+ rcu_read_lock();
+
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+ if (sd)
+ llc_cpus = sched_domain_span(sd);
+ }
+ }
+
/*
- * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
- * under utilized, wake up @p to the local DSQ of the waker. Checking
- * only for an empty local DSQ is insufficient as it could give the
- * wakee an unfair advantage when the system is oversaturated.
- * Checking only for the presence of idle CPUs is also insufficient as
- * the local DSQ of the waker could have tasks piled up on it even if
- * there is an idle core elsewhere on the system.
- */
- cpu = smp_processor_id();
- if ((wake_flags & SCX_WAKE_SYNC) &&
- !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ cpu = smp_processor_id();
+
+ /*
+		 * If the waker's CPU shares cache with prev_cpu and prev_cpu
+		 * is idle, then avoid a migration.
+ */
+ if (cpus_share_cache(cpu, prev_cpu) &&
+ test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
goto cpu_found;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ if (!cpumask_empty(idle_masks.cpu) &&
+ !(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto cpu_found;
+ }
}
/*
@@ -3127,29 +3432,80 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* partially idle @prev_cpu.
*/
if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto cpu_found;
}
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+		 * Search for any fully idle core usable by the task.
+ */
cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto cpu_found;
}
+ /*
+ * Use @prev_cpu if it's idle.
+ */
if (test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto cpu_found;
}
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = scx_pick_idle_cpu(llc_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ */
cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
goto cpu_found;
+ rcu_read_unlock();
return prev_cpu;
cpu_found:
+ rcu_read_unlock();
+
*found = true;
return cpu;
}
@@ -3272,6 +3628,9 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
+ if (scx_enabled())
+ update_selcpu_topology();
+
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
@@ -4281,6 +4640,49 @@ bool task_should_scx(int policy)
}
/**
+ * scx_softlockup - sched_ext softlockup handler
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system by enabling the breather and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+ switch (scx_ops_enable_state()) {
+ case SCX_OPS_ENABLING:
+ case SCX_OPS_ENABLED:
+ break;
+ default:
+ return;
+ }
+
+ /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ if (test_and_set_bit(0, &scx_in_softlockup))
+ return;
+
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
+ smp_processor_id(), dur_s, scx_ops.name);
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Enable breather
+	 * immediately; otherwise, we might not even be able to get to
+ * scx_ops_bypass().
+ */
+ atomic_inc(&scx_ops_breather_depth);
+
+ scx_ops_error("soft lockup - CPU#%d stuck for %us",
+ smp_processor_id(), dur_s);
+}
+
+static void scx_clear_softlockup(void)
+{
+ if (test_and_clear_bit(0, &scx_in_softlockup))
+ atomic_dec(&scx_ops_breather_depth);
+}
+
+/**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4312,10 +4714,11 @@ bool task_should_scx(int policy)
*/
static void scx_ops_bypass(bool bypass)
{
+ static DEFINE_RAW_SPINLOCK(bypass_lock);
int cpu;
unsigned long flags;
- raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
+ raw_spin_lock_irqsave(&bypass_lock, flags);
if (bypass) {
scx_ops_bypass_depth++;
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
@@ -4328,6 +4731,8 @@ static void scx_ops_bypass(bool bypass)
goto unlock;
}
+ atomic_inc(&scx_ops_breather_depth);
+
/*
* No task property is changing. We just need to make sure all currently
* queued tasks are re-queued according to the new scx_rq_bypassing()
@@ -4383,8 +4788,11 @@ static void scx_ops_bypass(bool bypass)
/* resched to restore ticks and idle state */
resched_cpu(cpu);
}
+
+ atomic_dec(&scx_ops_breather_depth);
unlock:
- raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
+ raw_spin_unlock_irqrestore(&bypass_lock, flags);
+ scx_clear_softlockup();
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -5095,6 +5503,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
+ update_selcpu_topology();
+#endif
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5302,67 +5713,7 @@ err_disable:
#include <linux/bpf.h>
#include <linux/btf.h>
-extern struct btf *btf_vmlinux;
static const struct btf_type *task_struct_type;
-static u32 task_struct_type_id;
-
-static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size,
- enum bpf_access_type type,
- const struct bpf_prog *prog,
- struct bpf_insn_access_aux *info)
-{
- struct btf *btf = bpf_get_btf_vmlinux();
- const struct bpf_struct_ops_desc *st_ops_desc;
- const struct btf_member *member;
- const struct btf_type *t;
- u32 btf_id, member_idx;
- const char *mname;
-
- /* struct_ops op args are all sequential, 64-bit numbers */
- if (off != arg_n * sizeof(__u64))
- return false;
-
- /* btf_id should be the type id of struct sched_ext_ops */
- btf_id = prog->aux->attach_btf_id;
- st_ops_desc = bpf_struct_ops_find(btf, btf_id);
- if (!st_ops_desc)
- return false;
-
- /* BTF type of struct sched_ext_ops */
- t = st_ops_desc->type;
-
- member_idx = prog->expected_attach_type;
- if (member_idx >= btf_type_vlen(t))
- return false;
-
- /*
- * Get the member name of this struct_ops program, which corresponds to
- * a field in struct sched_ext_ops. For example, the member name of the
- * dispatch struct_ops program (callback) is "dispatch".
- */
- member = &btf_type_member(t)[member_idx];
- mname = btf_name_by_offset(btf_vmlinux, member->name_off);
-
- if (!strcmp(mname, op)) {
- /*
- * The value is a pointer to a type (struct task_struct) given
- * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED),
- * however, can be a NULL (PTR_MAYBE_NULL). The BPF program
- * should check the pointer to make sure it is not NULL before
- * using it, or the verifier will reject the program.
- *
- * Longer term, this is something that should be addressed by
- * BTF, and be fully contained within the verifier.
- */
- info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED;
- info->btf = btf_vmlinux;
- info->btf_id = task_struct_type_id;
-
- return true;
- }
-
- return false;
-}
static bool bpf_scx_is_valid_access(int off, int size,
enum bpf_access_type type,
@@ -5371,9 +5722,6 @@ static bool bpf_scx_is_valid_access(int off, int size,
{
if (type != BPF_READ)
return false;
- if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) ||
- set_arg_maybe_null("yield", 1, off, size, type, prog, info))
- return true;
if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
return false;
if (off % size != 0)
@@ -5508,13 +5856,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
static int bpf_scx_init(struct btf *btf)
{
- s32 type_id;
-
- type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
- if (type_id < 0)
- return -EINVAL;
- task_struct_type = btf_type_by_id(btf, type_id);
- task_struct_type_id = type_id;
+ task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]);
return 0;
}
@@ -5536,78 +5878,78 @@ static int bpf_scx_validate(void *kdata)
return 0;
}
-static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
-static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
-static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
-static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
-static void tick_stub(struct task_struct *p) {}
-static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
-static void running_stub(struct task_struct *p) {}
-static void stopping_stub(struct task_struct *p, bool runnable) {}
-static void quiescent_stub(struct task_struct *p, u64 deq_flags) {}
-static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
-static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; }
-static void set_weight_stub(struct task_struct *p, u32 weight) {}
-static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
-static void update_idle_stub(s32 cpu, bool idle) {}
-static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
-static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
-static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
-static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
-static void enable_stub(struct task_struct *p) {}
-static void disable_stub(struct task_struct *p) {}
+static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
+static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
+static void sched_ext_ops__tick(struct task_struct *p) {}
+static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__running(struct task_struct *p) {}
+static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
+static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
+static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
+static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
+static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
+static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
+static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
+static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
+static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
+static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void sched_ext_ops__enable(struct task_struct *p) {}
+static void sched_ext_ops__disable(struct task_struct *p) {}
#ifdef CONFIG_EXT_GROUP_SCHED
-static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
-static void cgroup_exit_stub(struct cgroup *cgrp) {}
-static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
-static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
-static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
-static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {}
+static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
+static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
+static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
+static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
#endif
-static void cpu_online_stub(s32 cpu) {}
-static void cpu_offline_stub(s32 cpu) {}
-static s32 init_stub(void) { return -EINVAL; }
-static void exit_stub(struct scx_exit_info *info) {}
-static void dump_stub(struct scx_dump_ctx *ctx) {}
-static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
-static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+static void sched_ext_ops__cpu_online(s32 cpu) {}
+static void sched_ext_ops__cpu_offline(s32 cpu) {}
+static s32 sched_ext_ops__init(void) { return -EINVAL; }
+static void sched_ext_ops__exit(struct scx_exit_info *info) {}
+static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
+static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
+static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
- .select_cpu = select_cpu_stub,
- .enqueue = enqueue_stub,
- .dequeue = dequeue_stub,
- .dispatch = dispatch_stub,
- .tick = tick_stub,
- .runnable = runnable_stub,
- .running = running_stub,
- .stopping = stopping_stub,
- .quiescent = quiescent_stub,
- .yield = yield_stub,
- .core_sched_before = core_sched_before_stub,
- .set_weight = set_weight_stub,
- .set_cpumask = set_cpumask_stub,
- .update_idle = update_idle_stub,
- .cpu_acquire = cpu_acquire_stub,
- .cpu_release = cpu_release_stub,
- .init_task = init_task_stub,
- .exit_task = exit_task_stub,
- .enable = enable_stub,
- .disable = disable_stub,
+ .select_cpu = sched_ext_ops__select_cpu,
+ .enqueue = sched_ext_ops__enqueue,
+ .dequeue = sched_ext_ops__dequeue,
+ .dispatch = sched_ext_ops__dispatch,
+ .tick = sched_ext_ops__tick,
+ .runnable = sched_ext_ops__runnable,
+ .running = sched_ext_ops__running,
+ .stopping = sched_ext_ops__stopping,
+ .quiescent = sched_ext_ops__quiescent,
+ .yield = sched_ext_ops__yield,
+ .core_sched_before = sched_ext_ops__core_sched_before,
+ .set_weight = sched_ext_ops__set_weight,
+ .set_cpumask = sched_ext_ops__set_cpumask,
+ .update_idle = sched_ext_ops__update_idle,
+ .cpu_acquire = sched_ext_ops__cpu_acquire,
+ .cpu_release = sched_ext_ops__cpu_release,
+ .init_task = sched_ext_ops__init_task,
+ .exit_task = sched_ext_ops__exit_task,
+ .enable = sched_ext_ops__enable,
+ .disable = sched_ext_ops__disable,
#ifdef CONFIG_EXT_GROUP_SCHED
- .cgroup_init = cgroup_init_stub,
- .cgroup_exit = cgroup_exit_stub,
- .cgroup_prep_move = cgroup_prep_move_stub,
- .cgroup_move = cgroup_move_stub,
- .cgroup_cancel_move = cgroup_cancel_move_stub,
- .cgroup_set_weight = cgroup_set_weight_stub,
+ .cgroup_init = sched_ext_ops__cgroup_init,
+ .cgroup_exit = sched_ext_ops__cgroup_exit,
+ .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
+ .cgroup_move = sched_ext_ops__cgroup_move,
+ .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
+ .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
#endif
- .cpu_online = cpu_online_stub,
- .cpu_offline = cpu_offline_stub,
- .init = init_stub,
- .exit = exit_stub,
- .dump = dump_stub,
- .dump_cpu = dump_cpu_stub,
- .dump_task = dump_task_stub,
+ .cpu_online = sched_ext_ops__cpu_online,
+ .cpu_offline = sched_ext_ops__cpu_offline,
+ .init = sched_ext_ops__init,
+ .exit = sched_ext_ops__exit,
+ .dump = sched_ext_ops__dump,
+ .dump_cpu = sched_ext_ops__dump_cpu,
+ .dump_task = sched_ext_ops__dump_task,
};
static struct bpf_struct_ops bpf_sched_ext_ops = {
@@ -5754,7 +6096,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
if (cpu != cpu_of(this_rq)) {
/*
* Pairs with smp_store_release() issued by this CPU in
- * scx_next_task_picked() on the resched path.
+ * switch_class() on the resched path.
*
* We busy-wait here to guarantee that no other task can
* be scheduled on our core before the target CPU has
@@ -5939,7 +6281,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
.set = &scx_kfunc_ids_select_cpu,
};
-static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
@@ -5959,7 +6301,8 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
return true;
}
-static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
+static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
@@ -5986,14 +6329,14 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags
__bpf_kfunc_start_defs();
/**
- * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
- * @p: task_struct to dispatch
- * @dsq_id: DSQ to dispatch to
+ * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @enq_flags: SCX_ENQ_*
*
- * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
- * to call this function spuriously. Can be called from ops.enqueue(),
+ * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
+ * call this function spuriously. Can be called from ops.enqueue(),
* ops.select_cpu(), and ops.dispatch().
*
* When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
@@ -6002,14 +6345,14 @@ __bpf_kfunc_start_defs();
* ops.select_cpu() to be on the target CPU in the first place.
*
* When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
- * will be directly dispatched to the corresponding dispatch queue after
- * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
- * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
+ * will be directly inserted into the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
+ * inserted into the local DSQ of the CPU returned by ops.select_cpu().
* @enq_flags are OR'd with the enqueue flags on the enqueue path before the
- * task is dispatched.
+ * task is inserted.
*
* When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
- * and this function can be called upto ops.dispatch_max_batch times to dispatch
+ * and this function can be called up to ops.dispatch_max_batch times to insert
* multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
* remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
*
@@ -6021,10 +6364,10 @@ __bpf_kfunc_start_defs();
* %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
* scx_bpf_kick_cpu() to trigger scheduling.
*/
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
+ u64 enq_flags)
{
- if (!scx_dispatch_preamble(p, enq_flags))
+ if (!scx_dsq_insert_preamble(p, enq_flags))
return;
if (slice)
@@ -6032,30 +6375,42 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dispatch_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(p, dsq_id, enq_flags);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
+ u64 enq_flags)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
+ scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
}
/**
- * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
- * @p: task_struct to dispatch
- * @dsq_id: DSQ to dispatch to
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
* @enq_flags: SCX_ENQ_*
*
- * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
- * Tasks queued into the priority queue are ordered by @vtime and always
- * consumed after the tasks in the FIFO queue. All other aspects are identical
- * to scx_bpf_dispatch().
+ * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime. All other aspects
+ * are identical to scx_bpf_dsq_insert().
*
* @vtime ordering is according to time_before64() which considers wrapping. A
* numerically larger vtime may indicate an earlier position in the ordering and
* vice-versa.
+ *
+ * A DSQ can only be used as a FIFO or priority queue at any given time and this
+ * function must not be called on a DSQ which already has one or more FIFO tasks
+ * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
+ * SCX_DSQ_GLOBAL) cannot be used as priority queues.
*/
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
+__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dispatch_preamble(p, enq_flags))
+ if (!scx_dsq_insert_preamble(p, enq_flags))
return;
if (slice)
@@ -6065,12 +6420,22 @@ __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
+ scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
}
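
As a hedged illustration of the vtime variant (SHARED_DSQ and vtime_now are hypothetical; vtime_now would be advanced as tasks run in ops.running()/ops.stopping(), and wraparound handling is omitted):

#include <scx/common.bpf.h>

#define SHARED_DSQ 0		/* hypothetical user DSQ */
static u64 vtime_now;		/* advanced elsewhere as tasks run */

void BPF_STRUCT_OPS(sketch_vtime_enqueue, struct task_struct *p, u64 enq_flags)
{
	u64 vtime = p->scx.dsq_vtime;

	/* cap the credit an idle task can accumulate to one default slice */
	if (vtime < vtime_now - SCX_SLICE_DFL)	/* ignores wraparound for brevity */
		vtime = vtime_now - SCX_SLICE_DFL;

	scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, enq_flags);
}
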
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
@@ -6080,12 +6445,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
.set = &scx_kfunc_ids_enqueue_dispatch,
};
-static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
+ struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
- struct rq *this_rq, *src_rq, *dst_rq, *locked_rq;
+ struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
bool in_balance;
unsigned long flags;
@@ -6113,6 +6477,13 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
raw_spin_rq_lock(src_rq);
}
+ /*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q(). Insert a
+ * breather if necessary.
+ */
+ scx_ops_breather(src_rq);
+
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6131,51 +6502,18 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
/* @p is still on $src_dsq and stable, determine the destination */
dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
- if (dst_dsq->id == SCX_DSQ_LOCAL) {
- dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
- if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
- dst_dsq = find_global_dsq(p);
- dst_rq = src_rq;
- }
- } else {
- /* no need to migrate if destination is a non-local DSQ */
- dst_rq = src_rq;
- }
-
/*
- * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
- * CPU, @p will be migrated.
+ * Apply vtime and slice updates before moving so that the new time is
+ * visible before inserting into $dst_dsq. @p is still on $src_dsq but
+ * this is safe as we're locking it.
*/
- if (dst_dsq->id == SCX_DSQ_LOCAL) {
- /* @p is going from a non-local DSQ to a local DSQ */
- if (src_rq == dst_rq) {
- task_unlink_from_dsq(p, src_dsq);
- move_local_task_to_local_dsq(p, enq_flags,
- src_dsq, dst_rq);
- raw_spin_unlock(&src_dsq->lock);
- } else {
- raw_spin_unlock(&src_dsq->lock);
- move_remote_task_to_local_dsq(p, enq_flags,
- src_rq, dst_rq);
- locked_rq = dst_rq;
- }
- } else {
- /*
- * @p is going from a non-local DSQ to a non-local DSQ. As
- * $src_dsq is already locked, do an abbreviated dequeue.
- */
- task_unlink_from_dsq(p, src_dsq);
- p->scx.dsq = NULL;
- raw_spin_unlock(&src_dsq->lock);
-
- if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
- p->scx.dsq_vtime = kit->vtime;
- dispatch_enqueue(dst_dsq, p, enq_flags);
- }
-
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
+ p->scx.dsq_vtime = kit->vtime;
if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
p->scx.slice = kit->slice;
+ /* execute move */
+ locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
dispatched = true;
out:
if (in_balance) {
@@ -6227,21 +6565,20 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
}
/**
- * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ
- * @dsq_id: DSQ to consume
+ * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
+ * @dsq_id: DSQ to move task from
*
- * Consume a task from the non-local DSQ identified by @dsq_id and transfer it
- * to the current CPU's local DSQ for execution. Can only be called from
- * ops.dispatch().
+ * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
+ * local DSQ for execution. Can only be called from ops.dispatch().
*
- * This function flushes the in-flight dispatches from scx_bpf_dispatch() before
- * trying to consume the specified DSQ. It may also grab rq locks and thus can't
- * be called under any BPF locks.
+ * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
+ * before trying to move from the specified DSQ. It may also grab rq locks and
+ * thus can't be called under any BPF locks.
*
- * Returns %true if a task has been consumed, %false if there isn't any task to
- * consume.
+ * Returns %true if a task has been moved, %false if there isn't any task to
+ * move.
*/
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
@@ -6271,17 +6608,24 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
}
}
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
+ return scx_bpf_dsq_move_to_local(dsq_id);
+}
+
/**
- * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ
+ * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
- * @slice: duration the dispatched task can run for in nsecs
+ * @slice: duration the moved task can run for in nsecs
*
- * Override the slice of the next task that will be dispatched from @it__iter
- * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called,
- * the previous slice duration is kept.
+ * Override the slice of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous
+ * slice duration is kept.
*/
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
- struct bpf_iter_scx_dsq *it__iter, u64 slice)
+__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
+ u64 slice)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
@@ -6289,18 +6633,26 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
}
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
+ struct bpf_iter_scx_dsq *it__iter, u64 slice)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
+ scx_bpf_dsq_move_set_slice(it__iter, slice);
+}
+
/**
- * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
+ * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
* @it__iter: DSQ iterator in progress
* @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
*
- * Override the vtime of the next task that will be dispatched from @it__iter
- * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the
- * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
- * dispatch the next task, the override is ignored and cleared.
+ * Override the vtime of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice
+ * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the
+ * override is ignored and cleared.
*/
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
- struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
+ u64 vtime)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
@@ -6308,8 +6660,16 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
}
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
+ struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
+ scx_bpf_dsq_move_set_vtime(it__iter, vtime);
+}
+
/**
- * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
+ * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
* @it__iter: DSQ iterator in progress
* @p: task to transfer
* @dsq_id: DSQ to move @p to
@@ -6324,8 +6684,7 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
* @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
* been queued before the iteration started.
*
- * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
- * update.
+ * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
*
* Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
* lock (e.g. BPF timers or SYSCALL programs).
@@ -6333,16 +6692,25 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
* Returns %true if @p has been consumed, %false if @p had already been consumed
* or dequeued.
*/
+__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
- return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
- p, dsq_id, enq_flags);
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
+ return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
}
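
A sketch of the iterator/move pairing as it might be used from ops.dispatch() (the source DSQ id and eligibility test are made up; bpf_for_each(scx_dsq, ...) and BPF_FOR_EACH_ITER come from the scx common BPF header): walk a user DSQ and move the first task runnable here into the local DSQ, overriding its slice first.

#include <scx/common.bpf.h>

static bool steal_one(u64 src_dsq_id)
{
	struct task_struct *p;
	u32 this_cpu = bpf_get_smp_processor_id();

	bpf_for_each(scx_dsq, p, src_dsq_id, 0) {
		/* skip tasks that aren't allowed to run on this CPU */
		if (!bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			continue;

		scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, SCX_SLICE_DFL);
		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0))
			return true;	/* moved into this CPU's local DSQ */
	}
	return false;
}
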
/**
- * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
+ * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
* @it__iter: DSQ iterator in progress
* @p: task to transfer
* @dsq_id: DSQ to move @p to
@@ -6352,19 +6720,27 @@ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
* priority queue of the DSQ specified by @dsq_id. The destination must be a
* user DSQ as only user DSQs support priority queue.
*
- * @p's slice and vtime are kept by default. Use
- * scx_bpf_dispatch_from_dsq_set_slice() and
- * scx_bpf_dispatch_from_dsq_set_vtime() to update.
+ * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
+ * and scx_bpf_dsq_move_set_vtime() to update.
*
- * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
- * scx_bpf_dispatch_vtime() for more information on @vtime.
+ * All other aspects are identical to scx_bpf_dsq_move(). See
+ * scx_bpf_dsq_insert_vtime() for more information on @vtime.
*/
+__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
- return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
- p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()");
+ return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
}
__bpf_kfunc_end_defs();
@@ -6372,7 +6748,12 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
@@ -6473,6 +6854,12 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
@@ -7148,15 +7535,8 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
goto out;
- /*
- * A task_group may either be a cgroup or an autogroup. In the latter
- * case, @tg->css.cgroup is %NULL. A task_group can't become the other
- * kind once created.
- */
- if (tg && tg->css.cgroup)
- cgrp = tg->css.cgroup;
- else
- cgrp = &cgrp_dfl_root.cgrp;
+ cgrp = tg_cgrp(tg);
+
out:
cgroup_get(cgrp);
return cgrp;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 79e6cb1d5c48..5c9202cb8f59 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1305,7 +1305,6 @@ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,
* @write: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
- * @ppos: file position
* @ppos: the current position in the file
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 721c3b221048..74c2b1d43bb9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -242,6 +242,16 @@ config FUNCTION_GRAPH_RETVAL
enable it via the trace option funcgraph-retval.
See Documentation/trace/ftrace.rst
+config FUNCTION_GRAPH_RETADDR
+ bool "Kernel Function Graph Return Address"
+ depends on FUNCTION_GRAPH_TRACER
+ default n
+ help
+	  Support recording and printing the function return address when
+	  using the function graph tracer. This can help locate the code line
+	  from which a function is called. The feature is off by default and
+	  can be enabled via the trace option funcgraph-retaddr.
+
config DYNAMIC_FTRACE
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f86c78961708..949a3870946c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -802,6 +802,8 @@ struct send_signal_irq_work {
struct task_struct *task;
u32 sig;
enum pid_type type;
+ bool has_siginfo;
+ struct kernel_siginfo info;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -809,27 +811,46 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
static void do_bpf_send_signal(struct irq_work *entry)
{
struct send_signal_irq_work *work;
+ struct kernel_siginfo *siginfo;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV;
+
+ group_send_sig_info(work->sig, siginfo, work->task, work->type);
put_task_struct(work->task);
}
-static int bpf_send_signal_common(u32 sig, enum pid_type type)
+static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value)
{
struct send_signal_irq_work *work = NULL;
+ struct kernel_siginfo info;
+ struct kernel_siginfo *siginfo;
+
+ if (!task) {
+ task = current;
+ siginfo = SEND_SIG_PRIV;
+ } else {
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ info.si_value.sival_ptr = (void *)(unsigned long)value;
+ siginfo = &info;
+ }
/* Similar to bpf_probe_write_user, task needs to be
* in a sound condition and kernel memory access be
* permitted in order to send signal to the current
* task.
*/
- if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
+ if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
/* Task should not be pid=1 to avoid kernel panic. */
- if (unlikely(is_global_init(current)))
+ if (unlikely(is_global_init(task)))
return -EPERM;
if (irqs_disabled()) {
@@ -847,19 +868,22 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = get_task_struct(current);
+ work->task = get_task_struct(task);
+ work->has_siginfo = siginfo == &info;
+ if (work->has_siginfo)
+ copy_siginfo(&work->info, &info);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+ return group_send_sig_info(sig, siginfo, task, type);
}
BPF_CALL_1(bpf_send_signal, u32, sig)
{
- return bpf_send_signal_common(sig, PIDTYPE_TGID);
+ return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -871,7 +895,7 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
{
- return bpf_send_signal_common(sig, PIDTYPE_PID);
+ return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
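
For context, the existing helpers are typically invoked from a tracing program roughly like the sketch below (the tracepoint and signal number are illustrative); the task- and value-carrying variant is exposed separately as the bpf_send_signal_task() kfunc added at the bottom of this file:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tracepoint/syscalls/sys_enter_execve")
int notify_on_execve(void *ctx)
{
	/* deliver SIGUSR1 (10) to the current task group from tracing context */
	bpf_send_signal(10);
	return 0;
}
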
@@ -1557,6 +1581,17 @@ static inline bool is_kprobe_session(const struct bpf_prog *prog)
return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}
+static inline bool is_uprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_uprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1574,13 +1609,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_func_ip:
if (is_kprobe_multi(prog))
return &bpf_get_func_ip_proto_kprobe_multi;
- if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ if (is_uprobe_multi(prog))
return &bpf_get_func_ip_proto_uprobe_multi;
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
if (is_kprobe_multi(prog))
return &bpf_get_attach_cookie_proto_kmulti;
- if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ if (is_uprobe_multi(prog))
return &bpf_get_attach_cookie_proto_umulti;
return &bpf_get_attach_cookie_proto_trace;
default:
@@ -3072,6 +3107,7 @@ struct bpf_uprobe {
u64 cookie;
struct uprobe *uprobe;
struct uprobe_consumer consumer;
+ bool session;
};
struct bpf_uprobe_multi_link {
@@ -3084,7 +3120,7 @@ struct bpf_uprobe_multi_link {
};
struct bpf_uprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
unsigned long entry_ip;
struct bpf_uprobe *uprobe;
};
@@ -3195,17 +3231,22 @@ static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
unsigned long entry_ip,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ bool is_return, void *data)
{
struct bpf_uprobe_multi_link *link = uprobe->link;
struct bpf_uprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.entry_ip = entry_ip,
.uprobe = uprobe,
};
struct bpf_prog *prog = link->link.prog;
bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
- int err = 0;
+ int err;
if (link->task && !same_thread_group(current, link->task))
return 0;
@@ -3217,7 +3258,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
migrate_disable();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
@@ -3244,9 +3285,13 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
__u64 *data)
{
struct bpf_uprobe *uprobe;
+ int ret;
uprobe = container_of(con, struct bpf_uprobe, consumer);
- return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+ ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data);
+ if (uprobe->session)
+ return ret ? UPROBE_HANDLER_IGNORE : 0;
+ return 0;
}
static int
@@ -3256,14 +3301,16 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s
struct bpf_uprobe *uprobe;
uprobe = container_of(con, struct bpf_uprobe, consumer);
- return uprobe_prog_run(uprobe, func, regs);
+ uprobe_prog_run(uprobe, func, regs, true, data);
+ return 0;
}
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
@@ -3271,7 +3318,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->uprobe->cookie;
}
@@ -3295,7 +3343,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+ if (!is_uprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
@@ -3371,11 +3419,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uprobes[i].link = link;
- if (flags & BPF_F_UPROBE_MULTI_RETURN)
- uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
- else
+ if (!(flags & BPF_F_UPROBE_MULTI_RETURN))
uprobes[i].consumer.handler = uprobe_multi_link_handler;
-
+ if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog))
+ uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+ if (is_uprobe_session(prog))
+ uprobes[i].session = true;
if (pid)
uprobes[i].consumer.filter = uprobe_multi_link_filter;
}
@@ -3464,7 +3513,7 @@ static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
return 0;
- if (!is_kprobe_session(prog))
+ if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
return -EACCES;
return 0;
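
With the entry handler's return value now deciding whether the return probe fires (UPROBE_HANDLER_IGNORE above) and the session kfuncs admitted for uprobe sessions by this filter change, a consumer could look roughly like the sketch below. This is an illustrative, untested sketch and not part of the patch: the "uprobe.session" section name, the target binary/symbol, and the kfunc prototypes are assumptions carried over from the existing kprobe-session interface.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Session kfuncs shared with kprobe sessions (assumed prototypes). */
extern bool bpf_session_is_return(void) __ksym;
extern __u64 *bpf_session_cookie(void) __ksym;

SEC("uprobe.session//usr/bin/app:do_work")      /* hypothetical target */
int handle_do_work(struct pt_regs *ctx)
{
	__u64 *cookie = bpf_session_cookie();

	if (!bpf_session_is_return()) {
		/* Entry leg: stash a timestamp in the session cookie.
		 * Returning non-zero here would make the kernel skip the
		 * return probe for this invocation (UPROBE_HANDLER_IGNORE). */
		*cookie = bpf_ktime_get_ns();
		return 0;
	}

	/* Return leg: the cookie written at entry is still visible. */
	bpf_printk("do_work: %llu ns", bpf_ktime_get_ns() - *cookie);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
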
@@ -3482,3 +3531,16 @@ static int __init bpf_kprobe_multi_kfuncs_init(void)
}
late_initcall(bpf_kprobe_multi_kfuncs_init);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
+ u64 value)
+{
+ if (type != PIDTYPE_PID && type != PIDTYPE_TGID)
+ return -EINVAL;
+
+ return bpf_send_signal_common(sig, type, task, value);
+}
+
+__bpf_kfunc_end_defs();
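
The new kfunc takes an explicit task instead of implicitly signalling current, and the extra u64 value is passed down to bpf_send_signal_common() (the siginfo plumbing added above). A rough usage sketch follows; it is hypothetical and not from the patch, and it assumes vmlinux.h provides enum pid_type and that the kfunc set is available to this program type.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int bpf_send_signal_task(struct task_struct *task, int sig,
				enum pid_type type, u64 value) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(notify_new_task, struct task_struct *task, u64 clone_flags)
{
	/* Send SIGUSR1 to the freshly created task; the last argument is
	 * the value intended to ride along in the queued siginfo. */
	bpf_send_signal_task(task, 10 /* SIGUSR1 */, PIDTYPE_PID, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
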
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 69e226a48daa..0bf78517b5d4 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -153,7 +153,7 @@ enum {
* SHADOW_STACK_OFFSET: The size in long words of the shadow stack
* SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added
*/
-#define SHADOW_STACK_SIZE (PAGE_SIZE)
+#define SHADOW_STACK_SIZE (4096)
#define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long))
/* Leave on a buffer at the end */
#define SHADOW_STACK_MAX_OFFSET \
@@ -172,6 +172,8 @@ enum {
DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
+static struct kmem_cache *fgraph_stack_cachep;
+
static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
static unsigned long fgraph_array_bitmask;
@@ -390,21 +392,7 @@ void *fgraph_reserve_data(int idx, int size_bytes)
*/
void *fgraph_retrieve_data(int idx, int *size_bytes)
{
- int offset = current->curr_ret_stack - 1;
- unsigned long val;
-
- val = get_fgraph_entry(current, offset);
- while (__get_type(val) == FGRAPH_TYPE_DATA) {
- if (__get_data_index(val) == idx)
- goto found;
- offset -= __get_data_size(val) + 1;
- val = get_fgraph_entry(current, offset);
- }
- return NULL;
-found:
- if (size_bytes)
- *size_bytes = __get_data_size(val) * sizeof(long);
- return get_data_type_data(current, offset);
+ return fgraph_retrieve_parent_data(idx, size_bytes, 0);
}
/**
@@ -460,8 +448,56 @@ get_ret_stack(struct task_struct *t, int offset, int *frame_offset)
return RET_STACK(t, offset);
}
+/**
+ * fgraph_retrieve_parent_data - get data from a parent function
+ * @idx: The index into the fgraph_array (fgraph_ops::idx)
+ * @size_bytes: A pointer to retrieved data size
+ * @depth: The depth to find the parent (0 is the current function)
+ *
+ * This is similar to fgraph_retrieve_data() but can be used to retrieve
+ * data from a parent caller function.
+ *
+ * Return: a pointer to the specified parent data or NULL if not found
+ */
+void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth)
+{
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = current->curr_ret_stack;
+ unsigned long val;
+
+ if (offset <= 0)
+ return NULL;
+
+ for (;;) {
+ int next_offset;
+
+ ret_stack = get_ret_stack(current, offset, &next_offset);
+ if (!ret_stack || --depth < 0)
+ break;
+ offset = next_offset;
+ }
+
+ if (!ret_stack)
+ return NULL;
+
+ offset--;
+
+ val = get_fgraph_entry(current, offset);
+ while (__get_type(val) == FGRAPH_TYPE_DATA) {
+ if (__get_data_index(val) == idx)
+ goto found;
+ offset -= __get_data_size(val) + 1;
+ val = get_fgraph_entry(current, offset);
+ }
+ return NULL;
+found:
+ if (size_bytes)
+ *size_bytes = __get_data_size(val) * sizeof(long);
+ return get_data_type_data(current, offset);
+}
+
/* Both enabled by default (can be cleared by function_graph tracer flags) */
-static bool fgraph_sleep_time = true;
+bool fgraph_sleep_time = true;
#ifdef CONFIG_DYNAMIC_FTRACE
/*
@@ -524,7 +560,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
int fgraph_idx)
{
struct ftrace_ret_stack *ret_stack;
- unsigned long long calltime;
unsigned long val;
int offset;
@@ -554,8 +589,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
return -EBUSY;
}
- calltime = trace_clock_local();
-
offset = READ_ONCE(current->curr_ret_stack);
ret_stack = RET_STACK(current, offset);
offset += FGRAPH_FRAME_OFFSET;
@@ -589,7 +622,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
ret_stack->ret = ret;
ret_stack->func = func;
- ret_stack->calltime = calltime;
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
ret_stack->fp = frame_pointer;
#endif
@@ -723,7 +755,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
*offset += FGRAPH_FRAME_OFFSET;
*ret = ret_stack->ret;
trace->func = ret_stack->func;
- trace->calltime = ret_stack->calltime;
trace->overrun = atomic_read(&current->trace_overrun);
trace->depth = current->curr_ret_depth;
/*
@@ -868,6 +899,29 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
}
/**
+ * ftrace_graph_top_ret_addr - return the top return address in the shadow stack
+ * @task: The task to read the shadow stack from.
+ *
+ * Return the first return address on the shadow stack of the @task, which is
+ * not the fgraph's return_to_handler.
+ */
+unsigned long ftrace_graph_top_ret_addr(struct task_struct *task)
+{
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
+
+ if (offset < 0)
+ return 0;
+
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && ret_stack->ret == return_handler);
+
+ return ret_stack ? ret_stack->ret : 0;
+}
+
+/**
* ftrace_graph_ret_addr - return the original value of the return address
* @task: The task the unwinder is being executed on
* @idx: An initialized pointer to the next stack index to use
@@ -892,7 +946,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
{
struct ftrace_ret_stack *ret_stack;
unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
- int i = task->curr_ret_stack;
+ int i;
if (ret != return_handler)
return ret;
@@ -970,8 +1024,11 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
struct task_struct *g, *t;
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return -ENOMEM;
+
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -1002,7 +1059,7 @@ unlock:
rcu_read_unlock();
free:
for (i = start; i < end; i++)
- kfree(ret_stack_list[i]);
+ kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]);
return ret;
}
@@ -1012,9 +1069,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
- struct ftrace_ret_stack *ret_stack;
unsigned long long timestamp;
- int offset;
/*
* Does the user want to count the time a function was asleep.
@@ -1031,17 +1086,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
if (!next->ftrace_timestamp)
return;
- /*
- * Update all the counters in next to make up for the
- * time next was sleeping.
- */
- timestamp -= next->ftrace_timestamp;
-
- for (offset = next->curr_ret_stack; offset > 0; ) {
- ret_stack = get_ret_stack(next, offset, &offset);
- if (ret_stack)
- ret_stack->calltime += timestamp;
- }
+ next->ftrace_sleeptime += timestamp - next->ftrace_timestamp;
}
static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
@@ -1077,9 +1122,12 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
if (ftrace_graph_active) {
unsigned long *ret_stack;
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -1099,7 +1147,10 @@ void ftrace_graph_init_task(struct task_struct *t)
if (ftrace_graph_active) {
unsigned long *ret_stack;
- ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
@@ -1114,7 +1165,11 @@ void ftrace_graph_exit_task(struct task_struct *t)
/* NULL must become visible to IRQs before we free it: */
barrier();
- kfree(ret_stack);
+ if (ret_stack) {
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+ kmem_cache_free(fgraph_stack_cachep, ret_stack);
+ }
}
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1254,6 +1309,14 @@ int register_ftrace_graph(struct fgraph_ops *gops)
guard(mutex)(&ftrace_lock);
+ if (!fgraph_stack_cachep) {
+ fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
+ SHADOW_STACK_SIZE,
+ SHADOW_STACK_SIZE, 0, NULL);
+ if (!fgraph_stack_cachep)
+ return -ENOMEM;
+ }
+
if (!fgraph_initialized) {
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online",
fgraph_cpu_init, NULL);
@@ -1318,17 +1381,17 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
{
int command = 0;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
- goto out;
+ return;
if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
fgraph_array[gops->idx] != gops))
- goto out;
+ return;
if (fgraph_lru_release_index(gops->idx) < 0)
- goto out;
+ return;
fgraph_array[gops->idx] = &fgraph_stub;
@@ -1350,7 +1413,5 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
}
- out:
gops->saved_func = NULL;
- mutex_unlock(&ftrace_lock);
}
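
unregister_ftrace_graph() (and several ftrace.c paths further down) now relies on the scope-based guard() helpers from <linux/cleanup.h>, which is what lets the out: labels and explicit mutex_unlock() calls disappear. As a generic illustration of the pattern (hypothetical code, not from the patch):

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);

static int my_update(bool valid)
{
	guard(mutex)(&my_lock);	/* released when the scope is left */

	if (!valid)
		return -EINVAL;	/* no unlock label needed */

	/* ... update state under my_lock ... */
	return 0;		/* unlocked here as well */
}
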
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4c28dd177ca6..9b17efb1a87d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -820,10 +820,16 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
}
+struct profile_fgraph_data {
+ unsigned long long calltime;
+ unsigned long long subtime;
+ unsigned long long sleeptime;
+};
+
static int profile_graph_entry(struct ftrace_graph_ent *trace,
struct fgraph_ops *gops)
{
- struct ftrace_ret_stack *ret_stack;
+ struct profile_fgraph_data *profile_data;
function_profile_call(trace->func, 0, NULL, NULL);
@@ -831,9 +837,13 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
if (!current->ret_stack)
return 0;
- ret_stack = ftrace_graph_get_ret_stack(current, 0);
- if (ret_stack)
- ret_stack->subtime = 0;
+ profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data));
+ if (!profile_data)
+ return 0;
+
+ profile_data->subtime = 0;
+ profile_data->sleeptime = current->ftrace_sleeptime;
+ profile_data->calltime = trace_clock_local();
return 1;
}
@@ -841,33 +851,42 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
static void profile_graph_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops)
{
- struct ftrace_ret_stack *ret_stack;
+ struct profile_fgraph_data *profile_data;
struct ftrace_profile_stat *stat;
unsigned long long calltime;
+ unsigned long long rettime = trace_clock_local();
struct ftrace_profile *rec;
unsigned long flags;
+ int size;
local_irq_save(flags);
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
+ profile_data = fgraph_retrieve_data(gops->idx, &size);
+
/* If the calltime was zero'd ignore it */
- if (!trace->calltime)
+ if (!profile_data || !profile_data->calltime)
goto out;
- calltime = trace->rettime - trace->calltime;
+ calltime = rettime - profile_data->calltime;
+
+ if (!fgraph_sleep_time) {
+ if (current->ftrace_sleeptime)
+ calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
+ }
if (!fgraph_graph_time) {
+ struct profile_fgraph_data *parent_data;
/* Append this call time to the parent time to subtract */
- ret_stack = ftrace_graph_get_ret_stack(current, 1);
- if (ret_stack)
- ret_stack->subtime += calltime;
+ parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1);
+ if (parent_data)
+ parent_data->subtime += calltime;
- ret_stack = ftrace_graph_get_ret_stack(current, 0);
- if (ret_stack && ret_stack->subtime < calltime)
- calltime -= ret_stack->subtime;
+ if (profile_data->subtime && profile_data->subtime < calltime)
+ calltime -= profile_data->subtime;
else
calltime = 0;
}
@@ -883,6 +902,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
}
static struct fgraph_ops fprofiler_ops = {
+ .ops = {
+ .flags = FTRACE_OPS_FL_INITIALIZED,
+ INIT_OPS_HASH(fprofiler_ops.ops)
+ },
.entryfunc = &profile_graph_entry,
.retfunc = &profile_graph_return,
};
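
The profiler now keeps its calltime/subtime/sleeptime in per-frame data reserved on the shadow stack rather than in struct ftrace_ret_stack. Stripped of the profiling details, a function_graph client using the same API might look like this illustrative sketch (names invented; the fgraph_* calls and callback signatures match the ones used above):

#include <linux/ftrace.h>
#include <linux/trace_clock.h>
#include <linux/printk.h>

static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
{
	u64 *start = fgraph_reserve_data(gops->idx, sizeof(*start));

	if (!start)		/* no room on the shadow stack: skip frame */
		return 0;
	*start = trace_clock_local();
	return 1;
}

static void my_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
{
	int size;
	u64 *start = fgraph_retrieve_data(gops->idx, &size);

	/* fgraph_retrieve_parent_data(gops->idx, &size, 1) would read the
	 * caller's frame instead, as profile_graph_return() does above. */
	if (start)
		pr_info("%ps took %llu ns\n", (void *)trace->func,
			trace_clock_local() - *start);
}

static struct fgraph_ops my_gops = {
	.entryfunc = my_entry,
	.retfunc   = my_return,
};
/* register_ftrace_graph(&my_gops) / unregister_ftrace_graph(&my_gops) */
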
@@ -3663,7 +3686,8 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
}
-static u64 ftrace_update_time;
+u64 ftrace_update_time;
+u64 ftrace_total_mod_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
unsigned long ftrace_number_of_groups;
@@ -3683,7 +3707,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
- u64 start, stop;
+ u64 start, stop, update_time;
unsigned long update_cnt = 0;
unsigned long rec_flags = 0;
int i;
@@ -3727,7 +3751,11 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
}
stop = ftrace_now(raw_smp_processor_id());
- ftrace_update_time = stop - start;
+ update_time = stop - start;
+ if (mod)
+ ftrace_total_mod_time += update_time;
+ else
+ ftrace_update_time = update_time;
ftrace_update_tot_cnt += update_cnt;
return 0;
@@ -4806,15 +4834,13 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
mod_g.len = strlen(mod_g.search);
}
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out_unlock;
+ return 0;
- if (func_g.type == MATCH_INDEX) {
- found = add_rec_by_index(hash, &func_g, clear_filter);
- goto out_unlock;
- }
+ if (func_g.type == MATCH_INDEX)
+ return add_rec_by_index(hash, &func_g, clear_filter);
do_for_each_ftrace_rec(pg, rec) {
@@ -4823,16 +4849,12 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
ret = enter_record(hash, rec, clear_filter);
- if (ret < 0) {
- found = ret;
- goto out_unlock;
- }
+ if (ret < 0)
+ return ret;
found = 1;
}
cond_resched();
} while_for_each_ftrace_rec();
- out_unlock:
- mutex_unlock(&ftrace_lock);
return found;
}
@@ -4930,14 +4952,14 @@ static int cache_mod(struct trace_array *tr,
{
struct ftrace_mod_load *ftrace_mod, *n;
struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
- int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
/* We do not cache inverse filters */
if (func[0] == '!') {
+ int ret = -EINVAL;
+
func++;
- ret = -EINVAL;
/* Look to remove this hash */
list_for_each_entry_safe(ftrace_mod, n, head, list) {
@@ -4953,20 +4975,15 @@ static int cache_mod(struct trace_array *tr,
continue;
}
}
- goto out;
+ return ret;
}
- ret = -EINVAL;
/* We only care about modules that have not been loaded yet */
if (module_exists(module))
- goto out;
+ return -EINVAL;
/* Save this string off, and execute it when the module is loaded */
- ret = ftrace_add_mod(tr, func, module, enable);
- out:
- mutex_unlock(&ftrace_lock);
-
- return ret;
+ return ftrace_add_mod(tr, func, module, enable);
}
static int
@@ -5076,6 +5093,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *func;
int ret;
+ if (!tr)
+ return -ENODEV;
+
/* match_records() modifies func, and we need the original */
func = kstrdup(func_orig, GFP_KERNEL);
if (!func)
@@ -5276,7 +5296,7 @@ static void release_probe(struct ftrace_func_probe *probe)
{
struct ftrace_probe_ops *probe_ops;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
WARN_ON(probe->ref <= 0);
@@ -5294,7 +5314,6 @@ static void release_probe(struct ftrace_func_probe *probe)
list_del(&probe->list);
kfree(probe);
}
- mutex_unlock(&ftrace_lock);
}
static void acquire_probe_locked(struct ftrace_func_probe *probe)
@@ -6805,12 +6824,10 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
func_g.len = strlen(func_g.search);
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
- if (unlikely(ftrace_disabled)) {
- mutex_unlock(&ftrace_lock);
+ if (unlikely(ftrace_disabled))
return -ENODEV;
- }
do_for_each_ftrace_rec(pg, rec) {
@@ -6826,7 +6843,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
if (add_hash_entry(hash, rec->ip) == NULL)
- goto out;
+ return 0;
} else {
if (entry) {
free_hash_entry(hash, entry);
@@ -6835,13 +6852,8 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
} while_for_each_ftrace_rec();
-out:
- mutex_unlock(&ftrace_lock);
- if (fail)
- return -EINVAL;
-
- return 0;
+ return fail ? -EINVAL : 0;
}
static ssize_t
@@ -7920,7 +7932,7 @@ out:
void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- kmsan_unpoison_memory(fregs, sizeof(*fregs));
+ kmsan_unpoison_memory(fregs, ftrace_regs_size());
__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
#else
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5807116bcd0b..7e257e855dd1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -482,6 +482,8 @@ struct ring_buffer_per_cpu {
unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
+ /* pages generation counter, incremented when the list changes */
+ unsigned long cnt;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
@@ -1475,40 +1477,87 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
+static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(list->next)->prev) != list))
+ return false;
+
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(list->prev)->next) != list))
+ return false;
+
+ return true;
+}
+
/**
* rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
*
* As a safety measure we check to make sure the data pages have not
* been corrupted.
- *
- * Callers of this function need to guarantee that the list of pages doesn't get
- * modified during the check. In particular, if it's possible that the function
- * is invoked with concurrent readers which can swap in a new reader page then
- * the caller should take cpu_buffer->reader_lock.
*/
static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = rb_list_head(cpu_buffer->pages);
- struct list_head *tmp;
+ struct list_head *head, *tmp;
+ unsigned long buffer_cnt;
+ unsigned long flags;
+ int nr_loops = 0;
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(head->next)->prev) != head))
+ /*
+ * Walk the linked list underpinning the ring buffer and validate all
+ * its next and prev links.
+ *
+ * The check acquires the reader_lock to avoid concurrent processing
+ * with code that could be modifying the list. However, the lock cannot
+ * be held for the entire duration of the walk, as this would make the
+ * time when interrupts are disabled non-deterministic, dependent on the
+ * ring buffer size. Therefore, the code releases and re-acquires the
+ * lock after checking each page. The ring_buffer_per_cpu.cnt variable
+ * is then used to detect if the list was modified while the lock was
+ * not held, in which case the check needs to be restarted.
+ *
+ * The code attempts to perform the check at most three times before
+ * giving up. This is acceptable because this is only a self-validation
+ * to detect problems early on. In practice, the list modification
+ * operations are fairly spaced, and so this check typically succeeds at
+ * most on the second try.
+ */
+again:
+ if (++nr_loops > 3)
return;
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(head->prev)->next) != head))
- return;
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ head = rb_list_head(cpu_buffer->pages);
+ if (!rb_check_links(cpu_buffer, head))
+ goto out_locked;
+ buffer_cnt = cpu_buffer->cnt;
+ tmp = head;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
- return;
+ while (true) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
- return;
+ if (buffer_cnt != cpu_buffer->cnt) {
+ /* The list was updated, try again. */
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ goto again;
+ }
+
+ tmp = rb_list_head(tmp->next);
+ if (tmp == head)
+ /* The iteration circled back, all is done. */
+ goto out_locked;
+
+ if (!rb_check_links(cpu_buffer, tmp))
+ goto out_locked;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
+
+out_locked:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/*
@@ -2384,9 +2433,9 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
* __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
* @size: the size in bytes per cpu that is needed.
* @flags: attributes to set for the ring buffer.
+ * @order: sub-buffer order
* @start: start of allocated range
* @range_size: size of allocated range
- * @order: sub-buffer order
* @key: ring buffer reader_lock_key.
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -2532,6 +2581,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
/* make sure pages points to a valid page in the ring buffer */
cpu_buffer->pages = next_page;
+ cpu_buffer->cnt++;
/* update head page */
if (head_bit)
@@ -2638,6 +2688,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* pointer to point to end of list
*/
head_page->prev = last_page;
+ cpu_buffer->cnt++;
success = true;
break;
}
@@ -2873,12 +2924,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
*/
synchronize_rcu();
for_each_buffer_cpu(buffer, cpu) {
- unsigned long flags;
-
cpu_buffer = buffer->buffers[cpu];
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_check_pages(cpu_buffer);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
atomic_dec(&buffer->record_disabled);
}
@@ -4010,7 +4057,7 @@ static const char *show_irq_str(int bits)
return type[bits];
}
-/* Assume this is an trace event */
+/* Assume this is a trace event */
static const char *show_flags(struct ring_buffer_event *event)
{
struct trace_entry *entry;
@@ -5296,6 +5343,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
rb_inc_page(&cpu_buffer->head_page);
+ cpu_buffer->cnt++;
local_inc(&cpu_buffer->pages_read);
/* Finally update the reader page to the new head */
@@ -5835,12 +5883,9 @@ void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
- unsigned long flags;
/* Use this opportunity to check the integrity of the ring buffer. */
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_check_pages(cpu_buffer);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->resize_disabled);
kfree(iter->event);
@@ -6757,6 +6802,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
/* Install the new pages, remove the head from the list */
cpu_buffer->pages = cpu_buffer->new_pages.next;
list_del_init(&cpu_buffer->new_pages);
+ cpu_buffer->cnt++;
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
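
The rb_check_pages() rework pairs with the cpu_buffer->cnt++ increments added in rb_remove_pages(), rb_insert_pages(), rb_get_reader_page() and the sub-buffer order path: every writer that relinks the page list under reader_lock bumps the generation counter. The same walk-in-bounded-critical-sections idea, extracted from the ring buffer into a purely illustrative sketch with hypothetical types:

#include <linux/list.h>
#include <linux/spinlock.h>

struct checked_list {
	spinlock_t	 lock;
	unsigned long	 gen;	/* writers bump this on every relink */
	struct list_head head;
};

static bool check_all(struct checked_list *cl, bool (*ok)(struct list_head *))
{
	struct list_head *pos;
	unsigned long gen;
	int tries = 0;

again:
	if (++tries > 3)
		return true;	/* give up quietly, as rb_check_pages() does */

	spin_lock(&cl->lock);
	gen = cl->gen;
	pos = &cl->head;
	spin_unlock(&cl->lock);

	for (;;) {
		spin_lock(&cl->lock);
		if (gen != cl->gen) {
			/* list changed while unlocked: restart the walk */
			spin_unlock(&cl->lock);
			goto again;
		}
		pos = pos->next;
		if (pos == &cl->head || !ok(pos)) {
			bool all_good = (pos == &cl->head);

			spin_unlock(&cl->lock);
			return all_good;
		}
		spin_unlock(&cl->lock);
	}
}
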
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 008187ebd7fe..cdc3aea12c93 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -307,14 +307,14 @@ static void ring_buffer_producer(void)
if (!disable_reader) {
if (consumer_fifo)
trace_printk("Running Consumer at SCHED_FIFO %s\n",
- consumer_fifo == 1 ? "low" : "high");
+ str_low_high(consumer_fifo == 1));
else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
}
if (producer_fifo)
trace_printk("Running Producer at SCHED_FIFO %s\n",
- producer_fifo == 1 ? "low" : "high");
+ str_low_high(producer_fifo == 1));
else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index dc819aec43e8..279c70e1bd74 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -41,7 +41,7 @@
* per-task monitor, and so on), and the helper functions that glue the
* monitor to the system via trace. Generally, a monitor includes some form
* of trace output as a reaction for event parsing and exceptions,
- * as depicted bellow:
+ * as depicted below:
*
* Linux +----- RV Monitor ----------------------------------+ Formal
* Realm | | Realm
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6a891e00aa7f..3ef047ed9705 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -593,19 +593,6 @@ int tracing_check_open_get_tr(struct trace_array *tr)
return 0;
}
-int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event)
-{
- if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(call->filter, rec)) {
- __trace_event_discard_commit(buffer, event);
- return 1;
- }
-
- return 0;
-}
-
/**
* trace_find_filtered_pid - check if a pid exists in a filtered_pid list
* @filtered_pids: The list of pids to check
@@ -988,7 +975,8 @@ static inline void trace_access_lock_init(void)
#endif
#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct trace_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
@@ -997,7 +985,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
int skip, struct pt_regs *regs);
#else
-static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
+static inline void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
@@ -1934,7 +1923,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
- strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ strscpy(max_data->comm, tsk->comm);
max_data->pid = tsk->pid;
/*
* If tsk == current, then use current_uid(), as that does not use
@@ -2908,7 +2897,6 @@ void
trace_function(struct trace_array *tr, unsigned long ip, unsigned long
parent_ip, unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -2921,11 +2909,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- if (static_branch_unlikely(&trace_function_exports_enabled))
- ftrace_exports(event, TRACE_EXPORT_FUNCTION);
- __buffer_unlock_commit(buffer, event);
- }
+ if (static_branch_unlikely(&trace_function_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_FUNCTION);
+ __buffer_unlock_commit(buffer, event);
}
#ifdef CONFIG_STACKTRACE
@@ -2933,7 +2919,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
#define FTRACE_KSTACK_NESTING 4
-#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
+#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING)
struct ftrace_stack {
unsigned long calls[FTRACE_KSTACK_ENTRIES];
@@ -2947,11 +2933,11 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct trace_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
- struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
unsigned int size, nr_entries;
struct ftrace_stack *fstack;
@@ -2994,6 +2980,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
nr_entries = stack_trace_save(fstack->calls, size, skip);
}
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Mark entry of stack trace as trampoline code */
+ if (tr->ops && tr->ops->trampoline) {
+ unsigned long tramp_start = tr->ops->trampoline;
+ unsigned long tramp_end = tramp_start + tr->ops->trampoline_size;
+ unsigned long *calls = fstack->calls;
+
+ for (int i = 0; i < nr_entries; i++) {
+ if (calls[i] >= tramp_start && calls[i] < tramp_end)
+ calls[i] = FTRACE_TRAMPOLINE_MARKER;
+ }
+ }
+#endif
+
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
struct_size(entry, caller, nr_entries),
trace_ctx);
@@ -3005,8 +3005,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
memcpy(&entry->caller, fstack->calls,
flex_array_size(entry, caller, nr_entries));
- if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
out:
/* Again, don't let gcc optimize things here */
@@ -3024,7 +3023,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
}
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
@@ -3033,7 +3032,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL);
return;
}
@@ -3050,7 +3049,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
return;
ct_irq_enter_irqson();
- __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL);
ct_irq_exit_irqson();
}
@@ -3067,8 +3066,8 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(printk_trace->array_buffer.buffer,
- tracing_gen_ctx(), skip, NULL);
+ __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer,
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3079,7 +3078,6 @@ static void
ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer, unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
@@ -3113,8 +3111,7 @@ ftrace_trace_userstack(struct trace_array *tr,
memset(&entry->caller, 0, sizeof(entry->caller));
stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
- if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
out_drop_count:
__this_cpu_dec(user_stack_count);
@@ -3283,7 +3280,6 @@ static void trace_printk_start_stop_comm(int enabled)
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct trace_array *tr = READ_ONCE(printk_trace);
@@ -3327,10 +3323,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
entry->fmt = fmt;
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
- }
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
out:
ring_buffer_nest_end(buffer);
@@ -3350,7 +3344,6 @@ static int
__trace_array_vprintk(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
{
- struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
int len = 0, size;
struct print_entry *entry;
@@ -3385,10 +3378,8 @@ __trace_array_vprintk(struct trace_buffer *buffer,
entry->ip = ip;
memcpy(&entry->buf, tbuffer, len + 1);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
- }
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
out:
ring_buffer_nest_end(buffer);
@@ -8587,15 +8578,22 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
char *buf;
int r;
- /* 256 should be plenty to hold the amount needed */
- buf = kmalloc(256, GFP_KERNEL);
+ /* 512 should be plenty to hold the amount needed */
+#define DYN_INFO_BUF_SIZE 512
+
+ buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n",
+ r = scnprintf(buf, DYN_INFO_BUF_SIZE,
+ "%ld pages:%ld groups: %ld\n"
+ "ftrace boot update time = %llu (ns)\n"
+ "ftrace module total update time = %llu (ns)\n",
ftrace_update_tot_cnt,
ftrace_number_of_pages,
- ftrace_number_of_groups);
+ ftrace_number_of_groups,
+ ftrace_update_time,
+ ftrace_total_mod_time);
ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
kfree(buf);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c866991b9c78..266740b4e121 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -46,6 +46,7 @@ enum trace_type {
TRACE_BRANCH,
TRACE_GRAPH_RET,
TRACE_GRAPH_ENT,
+ TRACE_GRAPH_RETADDR_ENT,
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
@@ -512,6 +513,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
TRACE_GRAPH_ENT); \
+ IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\
+ TRACE_GRAPH_RETADDR_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct func_repeats_entry, \
@@ -772,6 +775,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
extern unsigned long ftrace_update_tot_cnt;
extern unsigned long ftrace_number_of_pages;
extern unsigned long ftrace_number_of_groups;
+extern u64 ftrace_update_time;
+extern u64 ftrace_total_mod_time;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
@@ -879,6 +884,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_GRAPH_TIME 0x400
#define TRACE_GRAPH_PRINT_RETVAL 0x800
#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
+#define TRACE_GRAPH_PRINT_RETADDR 0x2000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -900,6 +906,10 @@ extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx);
+extern int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned int trace_ctx);
@@ -1048,6 +1058,7 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
+extern bool fgraph_sleep_time;
static inline bool
ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
@@ -1429,10 +1440,6 @@ struct trace_subsystem_dir {
int nr_events;
};
-extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event);
-
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
@@ -2176,4 +2183,11 @@ static inline int rv_init_interface(void)
}
#endif
+/*
+ * This is used only to distinguish
+ * function address from trampoline code.
+ * So this value has no meaning.
+ */
+#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX)
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e47fdb4c92fb..6d08a5523ce0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,7 +30,6 @@ static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
- struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct trace_buffer *buffer;
struct trace_array_cpu *data;
@@ -74,16 +73,13 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
p--;
p++;
- strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
- strncpy(entry->file, p, TRACE_FILE_SIZE);
- entry->func[TRACE_FUNC_SIZE] = 0;
- entry->file[TRACE_FILE_SIZE] = 0;
+ strscpy(entry->func, f->data.func);
+ strscpy(entry->file, p);
entry->constant = f->constant;
entry->line = f->data.line;
entry->correct = val == expect;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
out:
current->trace_recursion &= ~TRACE_BRANCH_BIT;
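
The strncpy()-to-strscpy() conversions here and in __update_max_tr() above work because strscpy() always NUL-terminates (hence the dropped entry->func[TRACE_FUNC_SIZE] = 0 lines) and the two-argument form sizes the copy from the destination array. A small illustration with a hypothetical helper, not from the patch:

#include <linux/string.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void copy_comm(char dst[TASK_COMM_LEN], const char *src)
{
	/* strscpy() returns the copied length, or -E2BIG on truncation;
	 * the destination is NUL-terminated in either case. */
	if (strscpy(dst, src, TASK_COMM_LEN) < 0)
		pr_debug("comm '%s' truncated\n", src);
}
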
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 4702efb00ff2..4cb2ebc439be 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -154,5 +154,5 @@ static atomic64_t trace_counter;
*/
u64 notrace trace_clock_counter(void)
{
- return atomic64_add_return(1, &trace_counter);
+ return atomic64_inc_return(&trace_counter);
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c47422b20908..82fd174ebbe0 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -85,9 +85,35 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
);
-/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+
+/* Function call entry with a return address */
+FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
+
+ TRACE_GRAPH_RETADDR_ENT,
+
+ F_STRUCT(
+ __field_struct( struct fgraph_retaddr_ent, graph_ent )
+ __field_packed( unsigned long, graph_ent, func )
+ __field_packed( int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, retaddr )
+ ),
+
+ F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth,
+ (void *)__entry->retaddr)
+);
+
+#else
+
+#ifndef fgraph_retaddr_ent_entry
+#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry
+#endif
+
+#endif
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -110,6 +136,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
#else
+/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 05e791241812..3ff9caa4a71b 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
+ struct hw_perf_event *hwc = &p_event->hw;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(p_event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(p_event);
+ }
+
/*
* If TRACE_REG_PERF_ADD returns false; no custom action was performed
* and we need to take the default action of enqueueing our event on
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7266ec2a4eea..77e68efbd43e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3149,8 +3149,6 @@ static void __trace_remove_event_call(struct trace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- free_event_filter(call->filter);
- call->filter = NULL;
}
static int probe_remove_event_call(struct trace_event_call *call)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c611b281a5b..78051de581e7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- strncpy(num_buf, str + s, len);
+ memcpy(num_buf, str + s, len);
num_buf[len] = 0;
ret = kstrtoul(num_buf, 0, &ip);
@@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data,
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
- strncpy(pred->regex->pattern, str + s, len);
+ memcpy(pred->regex->pattern, str + s, len);
pred->regex->pattern[len] = 0;
} else if (!strncmp(str + i, "CPUS", 4)) {
@@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data,
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
- strncpy(pred->regex->pattern, str + s, len);
+ memcpy(pred->regex->pattern, str + s, len);
pred->regex->pattern[len] = 0;
filter_build_regex(pred);
@@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- strncpy(num_buf, str + s, len);
+ memcpy(num_buf, str + s, len);
num_buf[len] = 0;
/* Make sure it is a value */
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5f9119eb7c67..9c058aa8baf3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
{
struct tracepoint *tp = event->tp;
- if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+ if (unlikely(static_key_enabled(&tp->key))) {
struct tracepoint_func *probe_func_ptr;
synth_probe_func_t probe_func;
void *__data;
@@ -1354,10 +1354,7 @@ static const char *hist_field_name(struct hist_field *field,
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
else if (field->flags & HIST_FIELD_FL_STACKTRACE) {
- if (field->field)
- field_name = field->field->name;
- else
- field_name = "common_stacktrace";
+ field_name = "common_stacktrace";
} else if (field->flags & HIST_FIELD_FL_HITCOUNT)
field_name = "hitcount";
@@ -1599,7 +1596,7 @@ static inline void save_comm(char *comm, struct task_struct *task)
return;
}
- strncpy(comm, task->comm, TASK_COMM_LEN);
+ strscpy(comm, task->comm, TASK_COMM_LEN);
}
static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -3405,7 +3402,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
elt_data = context->elt->private_data;
track_elt_data = track_data->elt.private_data;
if (elt_data->comm)
- strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+ strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
track_data->updated = true;
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 42b0d998d103..17bcad8f79de 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1676,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user)
struct tracepoint *tp = &user->tracepoint;
char status = 0;
- if (atomic_read(&tp->key.enabled) > 0) {
+ if (static_key_enabled(&tp->key)) {
struct tracepoint_func *probe_func_ptr;
user_event_func_t probe_func;
@@ -2280,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
* It's possible key.enabled disables after this check, however
* we don't mind if a few events are included in this condition.
*/
- if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ if (likely(static_key_enabled(&tp->key))) {
struct tracepoint_func *probe_func_ptr;
user_event_func_t probe_func;
struct iov_iter copy;
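
Replacing atomic_read(&tp->key.enabled) > 0 with static_key_enabled(&tp->key) queries the jump-label state through its intended accessor instead of poking at the underlying atomic_t. For reference, the general shape of that API in a standalone sketch (illustrative only, not from the patch):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(my_feature);

void my_feature_set(bool on)
{
	if (on)
		static_branch_enable(&my_feature);
	else
		static_branch_disable(&my_feature);
}

bool my_feature_active(void)
{
	/* Slow-path query, like the tracepoint checks above. */
	return static_key_enabled(&my_feature);
}
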
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 3b0cea37e029..74c353164ca1 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -176,6 +176,27 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(&tr->array_buffer);
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static __always_inline unsigned long
+function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
+{
+ unsigned long true_parent_ip;
+ int idx = 0;
+
+ true_parent_ip = parent_ip;
+ if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs)
+ true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip,
+ (unsigned long *)ftrace_regs_get_stack_pointer(fregs));
+ return true_parent_ip;
+}
+#else
+static __always_inline unsigned long
+function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
+{
+ return parent_ip;
+}
+#endif
+
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
@@ -184,7 +205,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
- int cpu;
if (unlikely(!tr->function_enabled))
return;
@@ -193,10 +213,11 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
+
trace_ctx = tracing_gen_ctx();
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ data = this_cpu_ptr(tr->array_buffer.data);
if (!atomic_read(&data->disabled))
trace_function(tr, ip, parent_ip, trace_ctx);
@@ -241,6 +262,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
* recursive protection is performed.
*/
local_irq_save(flags);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
@@ -300,7 +322,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
unsigned int trace_ctx;
unsigned long flags;
int bit;
- int cpu;
if (unlikely(!tr->function_enabled))
return;
@@ -309,8 +330,8 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
+ data = this_cpu_ptr(tr->array_buffer.data);
if (atomic_read(&data->disabled))
goto out;
@@ -321,7 +342,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
* TODO: think about a solution that is better than just hoping to be
* lucky.
*/
- last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ last_info = this_cpu_ptr(tr->last_func_repeats);
if (is_repeat_check(tr, last_info, ip, parent_ip))
goto out;
@@ -356,6 +377,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
* recursive protection is performed.
*/
local_irq_save(flags);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a569daaac4c4..5504b5e4e7b4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -31,7 +31,10 @@ struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
- struct ftrace_graph_ent_entry ent;
+ union {
+ struct ftrace_graph_ent_entry ent;
+ struct fgraph_retaddr_ent_entry rent;
+ } ent;
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -64,6 +67,10 @@ static struct tracer_opt trace_opts[] = {
/* Display function return value in hexadecimal format ? */
{ TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
#endif
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ /* Display function return address ? */
+ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -83,6 +90,11 @@ static struct tracer_flags tracer_flags = {
.opts = trace_opts
};
+static bool tracer_flags_is_set(u32 flags)
+{
+ return (tracer_flags.val & flags) == flags;
+}
+
/*
* DURATION column is being also used to display IRQ signs,
* following values are used by print_graph_irq and others
@@ -102,7 +114,6 @@ int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
@@ -113,11 +124,42 @@ int __trace_graph_entry(struct trace_array *tr,
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
+
+ return 1;
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct fgraph_retaddr_ent_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
+ sizeof(*entry), trace_ctx);
+ if (!event)
+ return 0;
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent.func = trace->func;
+ entry->graph_ent.depth = trace->depth;
+ entry->graph_ent.retaddr = retaddr;
+ trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
+#else
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr)
+{
+ return 1;
+}
+#endif
static inline int ftrace_graph_ignore_irqs(void)
{
@@ -127,12 +169,18 @@ static inline int ftrace_graph_ignore_irqs(void)
return in_hardirq();
}
+struct fgraph_times {
+ unsigned long long calltime;
+ unsigned long long sleeptime; /* may be optional! */
+};
+
int trace_graph_entry(struct ftrace_graph_ent *trace,
struct fgraph_ops *gops)
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
+ struct fgraph_times *ftimes;
unsigned long flags;
unsigned int trace_ctx;
long disabled;
@@ -167,6 +215,19 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
if (ftrace_graph_ignore_irqs())
return 0;
+ if (fgraph_sleep_time) {
+ /* Only need to record the calltime */
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
+ } else {
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes));
+ if (ftimes)
+ ftimes->sleeptime = current->ftrace_sleeptime;
+ }
+ if (!ftimes)
+ return 0;
+
+ ftimes->calltime = trace_clock_local();
+
/*
* Stop here if tracing_threshold is set. We only write function return
* events to the ring buffer.
@@ -180,7 +241,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
- ret = __trace_graph_entry(tr, trace, trace_ctx);
+ if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+ tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) {
+ unsigned long retaddr = ftrace_graph_top_ret_addr(current);
+
+ ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+ } else
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
} else {
ret = 0;
}
@@ -223,7 +290,6 @@ void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
@@ -234,8 +300,17 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+ struct fgraph_times *ftimes,
+ int size)
+{
+ if (fgraph_sleep_time || size < sizeof(*ftimes))
+ return;
+
+ ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime;
}
void trace_graph_return(struct ftrace_graph_ret *trace,
@@ -244,9 +319,11 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
+ struct fgraph_times *ftimes;
unsigned long flags;
unsigned int trace_ctx;
long disabled;
+ int size;
int cpu;
ftrace_graph_addr_finish(gops, trace);
@@ -256,6 +333,14 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
return;
}
+ ftimes = fgraph_retrieve_data(gops->idx, &size);
+ if (!ftimes)
+ return;
+
+ handle_nosleeptime(trace, ftimes, size);
+
+ trace->calltime = ftimes->calltime;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
@@ -271,6 +356,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops)
{
+ struct fgraph_times *ftimes;
+ int size;
+
ftrace_graph_addr_finish(gops, trace);
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
@@ -278,8 +366,16 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
return;
}
+ ftimes = fgraph_retrieve_data(gops->idx, &size);
+ if (!ftimes)
+ return;
+
+ handle_nosleeptime(trace, ftimes, size);
+
+ trace->calltime = ftimes->calltime;
+
if (tracing_thresh &&
- (trace->rettime - trace->calltime < tracing_thresh))
+ (trace->rettime - ftimes->calltime < tracing_thresh))
return;
else
trace_graph_return(trace, gops);
@@ -457,7 +553,7 @@ get_return_for_leaf(struct trace_iterator *iter,
* then we just reuse the data from before.
*/
if (data && data->failed) {
- curr = &data->ent;
+ curr = &data->ent.ent;
next = &data->ret;
} else {
@@ -487,7 +583,10 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- data->ent = *curr;
+ if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
+ data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
+ else
+ data->ent.ent = *curr;
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
@@ -651,52 +750,96 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
}
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
-
#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+#else
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR
+static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry,
+ u32 trace_flags, bool comment)
+{
+ if (comment)
+ trace_seq_puts(s, " /*");
+
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+
+ if (comment)
+ trace_seq_puts(s, " */");
+}
+#else
+#define __TRACE_GRAPH_PRINT_RETADDR 0
+#define print_graph_retaddr(_seq, _entry, _tflags, _comment) do { } while (0)
+#endif
-static void print_graph_retval(struct trace_seq *s, unsigned long retval,
- bool leaf, void *func, bool hex_format)
+#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR)
+
+static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
+ struct ftrace_graph_ret *graph_ret, void *func,
+ u32 opt_flags, u32 trace_flags)
{
unsigned long err_code = 0;
+ unsigned long retval = 0;
+ bool print_retaddr = false;
+ bool print_retval = false;
+ bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX);
- if (retval == 0 || hex_format)
- goto done;
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ retval = graph_ret->retval;
+ print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL);
+#endif
- /* Check if the return value matches the negative format */
- if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
- (((u64)retval) >> 32) == 0) {
- /* sign extension */
- err_code = (unsigned long)(s32)retval;
- } else {
- err_code = retval;
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR);
+#endif
+
+ if (print_retval && retval && !hex_format) {
+ /* Check if the return value matches the negative format */
+ if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+ (((u64)retval) >> 32) == 0) {
+ err_code = sign_extend64(retval, 31);
+ } else {
+ err_code = retval;
+ }
+
+ if (!IS_ERR_VALUE(err_code))
+ err_code = 0;
}
- if (!IS_ERR_VALUE(err_code))
- err_code = 0;
+ if (entry) {
+ if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
+ print_retaddr = false;
-done:
- if (leaf) {
- if (hex_format || (err_code == 0))
- trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
- func, retval);
+ trace_seq_printf(s, "%ps();", func);
+ if (print_retval || print_retaddr)
+ trace_seq_puts(s, " /*");
else
- trace_seq_printf(s, "%ps(); /* = %ld */\n",
- func, err_code);
+ trace_seq_putc(s, '\n');
} else {
+ print_retaddr = false;
+ trace_seq_printf(s, "} /* %ps", func);
+ }
+
+ if (print_retaddr)
+ print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+ trace_flags, false);
+
+ if (print_retval) {
if (hex_format || (err_code == 0))
- trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
- func, retval);
+ trace_seq_printf(s, " ret=0x%lx", retval);
else
- trace_seq_printf(s, "} /* %ps = %ld */\n",
- func, err_code);
+ trace_seq_printf(s, " ret=%ld", err_code);
}
+
+ if (!entry || print_retval || print_retaddr)
+ trace_seq_puts(s, " */\n");
}
#else
-#define __TRACE_GRAPH_PRINT_RETVAL 0
-
-#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0)
#endif
@@ -748,14 +891,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
trace_seq_putc(s, ' ');
/*
- * Write out the function return value if the option function-retval is
- * enabled.
+ * Write out the function return value or return address
*/
- if (flags & __TRACE_GRAPH_PRINT_RETVAL)
- print_graph_retval(s, graph_ret->retval, true, (void *)func,
- !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
- else
+ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
+ print_graph_retval(s, entry, graph_ret,
+ (void *)graph_ret->func + iter->tr->text_delta,
+ flags, tr->trace_flags);
+ } else {
trace_seq_printf(s, "%ps();\n", (void *)func);
+ }
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -796,7 +940,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
func = call->func + iter->tr->text_delta;
- trace_seq_printf(s, "%ps() {\n", (void *)func);
+ trace_seq_printf(s, "%ps() {", (void *)func);
+ if (flags & __TRACE_GRAPH_PRINT_RETADDR &&
+ entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
+ print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+ tr->trace_flags, true);
+ trace_seq_putc(s, '\n');
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
@@ -1043,11 +1192,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
/*
* Always write out the function name and its return value if the
- * function-retval option is enabled.
+ * funcgraph-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, trace->retval, false, (void *)func,
- !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags);
} else {
/*
* If the return function does not have a matching entry,
@@ -1162,7 +1310,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
* to print out the missing entry which would never go out.
*/
if (data && data->failed) {
- field = &data->ent;
+ field = &data->ent.ent;
iter->cpu = data->cpu;
ret = print_graph_entry(field, s, iter, flags);
if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
@@ -1186,6 +1334,16 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
saved = *field;
return print_graph_entry(&saved, s, iter, flags);
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ case TRACE_GRAPH_RETADDR_ENT: {
+ struct fgraph_retaddr_ent_entry saved;
+ struct fgraph_retaddr_ent_entry *rfield;
+
+ trace_assign_type(rfield, entry);
+ saved = *rfield;
+ return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+ }
+#endif
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
@@ -1380,6 +1538,13 @@ static struct trace_event graph_trace_entry_event = {
.funcs = &graph_functions,
};
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+static struct trace_event graph_trace_retaddr_entry_event = {
+ .type = TRACE_GRAPH_RETADDR_ENT,
+ .funcs = &graph_functions,
+};
+#endif
+
static struct trace_event graph_trace_ret_event = {
.type = TRACE_GRAPH_RET,
.funcs = &graph_functions
@@ -1466,6 +1631,13 @@ static __init int init_graph_trace(void)
return 1;
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ if (!register_trace_event(&graph_trace_retaddr_entry_event)) {
+ pr_warn("Warning: could not register graph trace retaddr events\n");
+ return 1;
+ }
+#endif
+
if (!register_trace_event(&graph_trace_ret_event)) {
pr_warn("Warning: could not register graph trace events\n");
return 1;
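The rewritten print_graph_retval() above detects a negative errno that was returned in a 64-bit register as a zero-extended 32-bit value and sign-extends it before the IS_ERR_VALUE() check. A standalone sketch of that detection (not part of the patch; sign_extend64() and IS_ERR_VALUE() are reimplemented locally, assuming the kernel's MAX_ERRNO of 4095):

#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long long)(x) >= (unsigned long long)-MAX_ERRNO)

/* Local stand-in for the kernel's sign_extend64(): extend from bit 'index'. */
static long long sign_extend64(unsigned long long value, int index)
{
	int shift = 63 - index;

	return (long long)(value << shift) >> shift;
}

int main(void)
{
	unsigned long long retval = 0xfffffff2ULL;	/* -EFAULT (-14) zero-extended to 64 bits */
	long long err_code;

	if ((retval & (1ULL << 31)) && (retval >> 32) == 0)
		err_code = sign_extend64(retval, 31);	/* -> -14 */
	else
		err_code = (long long)retval;

	if (!IS_ERR_VALUE(err_code))
		err_code = 0;				/* not an errno, print the raw value instead */

	printf("retval=0x%llx err_code=%lld\n", retval, err_code);
	return 0;
}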
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 3bd6071441ad..b65353ec2837 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -130,7 +130,6 @@ static bool hwlat_busy;
static void trace_hwlat_sample(struct hwlat_sample *sample)
{
struct trace_array *tr = hwlat_trace;
- struct trace_event_call *call = &event_hwlat;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
@@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
entry->nmi_count = sample->nmi_count;
entry->count = sample->count;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/* Macros to encapsulate the time capturing infrastructure */
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..1e72d20b3c2f 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -96,22 +96,19 @@ static int kdb_ftdump(int argc, const char **argv)
{
int skip_entries = 0;
long cpu_file;
- char *cp;
+ int err;
int cnt;
int cpu;
if (argc > 2)
return KDB_ARGCOUNT;
- if (argc) {
- skip_entries = simple_strtol(argv[1], &cp, 0);
- if (*cp)
- skip_entries = 0;
- }
+ if (argc && kstrtoint(argv[1], 0, &skip_entries))
+ return KDB_BADINT;
if (argc == 2) {
- cpu_file = simple_strtol(argv[2], &cp, 0);
- if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ err = kstrtol(argv[2], 0, &cpu_file);
+ if (err || cpu_file >= NR_CPUS || cpu_file < 0 ||
!cpu_online(cpu_file))
return KDB_BADINT;
} else {
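The kdb_ftdump() change above replaces simple_strtol() plus a manual *cp check with kstrtoint()/kstrtol(), which fail outright on trailing garbage. A rough userspace analogue of that stricter parsing (not part of the patch; kstrtoint_like() is a hypothetical helper built on strtol()):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Approximates kstrtoint(): reject empty input, trailing junk and overflow. */
static int kstrtoint_like(const char *s, int base, int *res)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, base);
	if (errno || end == s || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -EINVAL;
	*res = (int)val;
	return 0;
}

int main(void)
{
	int v;

	printf("\"42\"  -> %d\n", kstrtoint_like("42", 0, &v) ? -1 : v);	/* 42 */
	printf("\"42x\" -> %d\n", kstrtoint_like("42x", 0, &v) ? -1 : v);	/* rejected: -1 */
	return 0;
}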
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 64e77b513697..ba5858866b2f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
- struct trace_event_call *call = &event_mmiotrace_rw;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
@@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->rw = *rw;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
- struct trace_event_call *call = &event_mmiotrace_map;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
@@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->map = *map;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a50ed23bee77..b9f96c77527d 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -499,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s)
static void
__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -517,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe
entry->softirq_count = sample->softirq_count;
entry->thread_count = sample->thread_count;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
@@ -578,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s)
static void
__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -591,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf
entry->context = sample->context;
entry->timer_latency = sample->timer_latency;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
@@ -654,7 +650,6 @@ static void timerlat_save_stack(int skip)
static void
__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct stack_entry *entry;
@@ -668,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 868f2f912f28..e08aee34ef63 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -460,7 +460,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
bh_off ? 'b' :
- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
@@ -1246,6 +1245,10 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
trace_seq_puts(s, " => ");
+ if ((*p) == FTRACE_TRAMPOLINE_MARKER) {
+ trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n");
+ continue;
+ }
seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index e37446f7916e..5c03633316a6 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,20 +15,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>
-/*
- * Use regular trace points on architectures that implement noinstr
- * tooling: these calls will only happen with RCU enabled, which can
- * use a regular tracepoint.
- *
- * On older architectures, use the rcuidle tracing methods (which
- * aren't NMI-safe - so exclude NMI contexts):
- */
-#ifdef CONFIG_ARCH_WANTS_NO_INSTR
-#define trace(point) trace_##point
-#else
-#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
-#endif
-
#ifdef CONFIG_TRACE_IRQFLAGS
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -42,7 +28,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -53,7 +39,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -75,7 +61,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
}
}
@@ -89,7 +75,7 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
@@ -100,13 +86,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- trace(preempt_enable)(a0, a1);
+ trace_preempt_enable(a0, a1);
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- trace(preempt_disable)(a0, a1);
+ trace_preempt_disable(a0, a1);
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8a407adb0e1c..573b5d8e8a28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -187,7 +187,7 @@ static inline char *get_saved_cmdlines(int idx)
static inline void set_cmdline(int idx, const char *cmdline)
{
- strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+ strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
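The set_cmdline() change swaps strncpy() for strscpy(), which always NUL-terminates the destination and reports truncation instead of silently leaving an unterminated buffer. A userspace approximation of those semantics (not part of the patch; strscpy_like() is a hypothetical stand-in):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Always NUL-terminates; returns the copied length or -E2BIG on truncation. */
static ssize_t strscpy_like(char *dst, const char *src, size_t size)
{
	size_t len;

	if (size == 0)
		return -E2BIG;
	len = strnlen(src, size);
	if (len == size) {			/* source does not fit */
		len = size - 1;
		memcpy(dst, src, len);
		dst[len] = '\0';
		return -E2BIG;
	}
	memcpy(dst, src, len + 1);		/* includes the trailing NUL */
	return (ssize_t)len;
}

int main(void)
{
	char comm[8];

	/* Truncated but still NUL-terminated: prints "kworker" */
	printf("%zd \"%s\"\n", strscpy_like(comm, "kworker/u8:3", sizeof(comm)), comm);
	return 0;
}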
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ae2ace5e515a..d6c7f18daa15 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -378,7 +378,6 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *next,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_context_switch;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -396,8 +395,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void
@@ -406,7 +404,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *curr,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -424,8 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void notrace
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 1469dd8075fa..38b5754790c9 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_PRINT:
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
+ case TRACE_GRAPH_RETADDR_ENT:
case TRACE_GRAPH_RET:
return 1;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 785733245ead..46aab0ab9350 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -299,6 +299,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int syscall_nr;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -338,6 +345,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_buffer fbuffer;
int syscall_nr;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -584,6 +598,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -686,6 +707,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
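The syscall probes above now rely on guard(preempt_notrace)(), a scope-based guard that re-enables preemption automatically on every return path, which is why the early returns that follow it need no explicit cleanup. A userspace sketch of the same scope-based pattern (not part of the patch; scoped_guard_lock() is a hypothetical macro using the compiler cleanup attribute, with a pthread mutex standing in for preemption control):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t probe_lock = PTHREAD_MUTEX_INITIALIZER;

static void guard_release(pthread_mutex_t **lockp)
{
	pthread_mutex_unlock(*lockp);
}

/* Lock now, unlock automatically when the enclosing scope is left. */
#define scoped_guard_lock(lock) \
	pthread_mutex_t *__guard __attribute__((cleanup(guard_release))) = (lock); \
	pthread_mutex_lock(__guard)

static void probe(long syscall_nr)
{
	scoped_guard_lock(&probe_lock);

	if (syscall_nr < 0)
		return;			/* unlock runs here automatically */

	printf("traced syscall %ld\n", syscall_nr);
}					/* ...and here on the normal path */

int main(void)
{
	probe(-1);
	probe(64);
	return 0;
}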
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 8879da16ef4d..1848ce7e2976 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,9 +25,6 @@ enum tp_func_state {
extern tracepoint_ptr_t __start___tracepoints_ptrs[];
extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
-DEFINE_SRCU(tracepoint_srcu);
-EXPORT_SYMBOL_GPL(tracepoint_srcu);
-
enum tp_transition_sync {
TP_TRANSITION_SYNC_1_0_1,
TP_TRANSITION_SYNC_N_2_1,
@@ -37,7 +34,6 @@ enum tp_transition_sync {
struct tp_transition_snapshot {
unsigned long rcu;
- unsigned long srcu;
bool ongoing;
};
@@ -50,7 +46,6 @@ static void tp_rcu_get_state(enum tp_transition_sync sync)
/* Keep the latest get_state snapshot. */
snapshot->rcu = get_state_synchronize_rcu();
- snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
snapshot->ongoing = true;
}
@@ -61,8 +56,6 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync)
if (!snapshot->ongoing)
return;
cond_synchronize_rcu(snapshot->rcu);
- if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
- synchronize_srcu(&tracepoint_srcu);
snapshot->ongoing = false;
}
@@ -85,9 +78,6 @@ static LIST_HEAD(tracepoint_module_list);
*/
static DEFINE_MUTEX(tracepoints_mutex);
-static struct rcu_head *early_probes;
-static bool ok_to_free_tracepoints;
-
/*
* Note about RCU :
* It is used to delay the free of multiple probes array until a quiescent
@@ -111,57 +101,21 @@ static inline void *allocate_probes(int count)
return p == NULL ? NULL : p->probes;
}
-static void srcu_free_old_probes(struct rcu_head *head)
-{
- kfree(container_of(head, struct tp_probes, rcu));
-}
-
static void rcu_free_old_probes(struct rcu_head *head)
{
- call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
-}
-
-static __init int release_early_probes(void)
-{
- struct rcu_head *tmp;
-
- ok_to_free_tracepoints = true;
-
- while (early_probes) {
- tmp = early_probes;
- early_probes = tmp->next;
- call_rcu(tmp, rcu_free_old_probes);
- }
-
- return 0;
+ kfree(container_of(head, struct tp_probes, rcu));
}
-/* SRCU is initialized at core_initcall */
-postcore_initcall(release_early_probes);
-
-static inline void release_probes(struct tracepoint_func *old)
+static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old)
{
if (old) {
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
- /*
- * We can't free probes if SRCU is not initialized yet.
- * Postpone the freeing till after SRCU is initialized.
- */
- if (unlikely(!ok_to_free_tracepoints)) {
- tp_probes->rcu.next = early_probes;
- early_probes = &tp_probes->rcu;
- return;
- }
-
- /*
- * Tracepoint probes are protected by both sched RCU and SRCU,
- * by calling the SRCU callback in the sched RCU callback we
- * cover both cases. So let us chain the SRCU and sched RCU
- * callbacks to wait for both grace periods.
- */
- call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+ if (tracepoint_is_faultable(tp))
+ call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes);
+ else
+ call_rcu(&tp_probes->rcu, rcu_free_old_probes);
}
}
@@ -327,8 +281,8 @@ static int tracepoint_add_func(struct tracepoint *tp,
struct tracepoint_func *old, *tp_funcs;
int ret;
- if (tp->regfunc && !static_key_enabled(&tp->key)) {
- ret = tp->regfunc();
+ if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) {
+ ret = tp->ext->regfunc();
if (ret < 0)
return ret;
}
@@ -358,7 +312,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
tracepoint_update_call(tp, tp_funcs);
/* Both iterator and static call handle NULL tp->funcs */
rcu_assign_pointer(tp->funcs, tp_funcs);
- static_key_enable(&tp->key);
+ static_branch_enable(&tp->key);
break;
case TP_FUNC_2: /* 1->2 */
/* Set iterator static call */
@@ -383,7 +337,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
break;
}
- release_probes(old);
+ release_probes(tp, old);
return 0;
}
@@ -411,10 +365,9 @@ static int tracepoint_remove_func(struct tracepoint *tp,
switch (nr_func_state(tp_funcs)) {
case TP_FUNC_0: /* 1->0 */
/* Removed last function */
- if (tp->unregfunc && static_key_enabled(&tp->key))
- tp->unregfunc();
-
- static_key_disable(&tp->key);
+ if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key))
+ tp->ext->unregfunc();
+ static_branch_disable(&tp->key);
/* Set iterator static call */
tracepoint_update_call(tp, tp_funcs);
/* Both iterator and static call handle NULL tp->funcs */
@@ -455,7 +408,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
WARN_ON_ONCE(1);
break;
}
- release_probes(old);
+ release_probes(tp, old);
return 0;
}
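release_probes() above now picks the deferred-free flavor per tracepoint (call_rcu_tasks_trace() for faultable probes, call_rcu() otherwise), but both paths end in rcu_free_old_probes(), which recovers the enclosing allocation from the embedded head. A standalone sketch of that container_of() callback pattern (not part of the patch; the types and names here are illustrative):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct cb_head {
	void (*func)(struct cb_head *head);
};

/* Embeds the callback head, like struct tp_probes embeds its rcu_head. */
struct tp_probes_demo {
	int nr_probes;
	struct cb_head rcu;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void free_old_probes(struct cb_head *head)
{
	free(container_of(head, struct tp_probes_demo, rcu));
}

int main(void)
{
	struct tp_probes_demo *p = calloc(1, sizeof(*p));

	if (!p)
		return 1;
	p->nr_probes = 2;
	p->rcu.func = free_old_probes;

	/* A real grace period (RCU or RCU-tasks-trace) would elapse here. */
	p->rcu.func(&p->rcu);
	printf("old probes freed\n");
	return 0;
}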
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 696406939be5..f950b5e59d63 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -70,7 +70,7 @@ static long ue_int_max = INT_MAX;
.extra1 = &ue_zero, \
.extra2 = &ue_int_max, \
}
-static struct ctl_table user_table[] = {
+static const struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),
UCOUNT_ENTRY("max_pid_namespaces"),
UCOUNT_ENTRY("max_uts_namespaces"),
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 262691ba62b7..5a93d4c446b8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts,
need_counting_irqs())
start_counting_irqs();
+ /*
+ * A poorly behaving BPF scheduler can live-lock the system into
+ * soft lockups. Tell sched_ext to try ejecting the BPF
+ * scheduler when close to a soft lockup.
+ */
+ if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+ scx_softlockup(now - touch_ts);
+
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
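The new scx_softlockup() nudge above fires once the stall reaches three quarters of the softlockup threshold, i.e. before the full-threshold warning. A small arithmetic sketch of the two trigger points (not part of the patch; the 20-second threshold and timestamps are made up):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long thresh = 20;	/* assumed softlockup threshold, seconds */
	unsigned long period_ts = 100;	/* last time the watchdog was touched */

	for (unsigned long now = 110; now <= 125; now += 5) {
		bool nudge_scx = now >= period_ts + thresh * 3 / 4;	/* from 115 on */
		bool warn      = now >  period_ts + thresh;		/* after 120 */

		printf("now=%lu nudge_scx=%d warn=%d\n", now, nudge_scx, warn);
	}
	return 0;
}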
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9949ffad8df0..8b07576814a5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3833,16 +3833,28 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
{
bool wait = false;
struct pool_workqueue *pwq;
+ struct worker_pool *current_pool = NULL;
if (flush_color >= 0) {
WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
atomic_set(&wq->nr_pwqs_to_flush, 1);
}
+ /*
+ * For an unbound workqueue, pwqs map to only a few pools, and
+ * pwqs within the same pool are linked sequentially to wq->pwqs
+ * by cpu index. Across most iterations the pool is therefore
+ * unchanged, so the lock is only dropped and re-taken when the
+ * pool actually changes, which greatly reduces the number of
+ * expensive lock operations.
+ */
for_each_pwq(pwq, wq) {
- struct worker_pool *pool = pwq->pool;
-
- raw_spin_lock_irq(&pool->lock);
+ if (current_pool != pwq->pool) {
+ if (likely(current_pool))
+ raw_spin_unlock_irq(&current_pool->lock);
+ current_pool = pwq->pool;
+ raw_spin_lock_irq(&current_pool->lock);
+ }
if (flush_color >= 0) {
WARN_ON_ONCE(pwq->flush_color != -1);
@@ -3859,9 +3871,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
pwq->work_color = work_color;
}
- raw_spin_unlock_irq(&pool->lock);
}
+ if (current_pool)
+ raw_spin_unlock_irq(&current_pool->lock);
+
if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
complete(&wq->first_flusher->done);
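The flush_workqueue_prep_pwqs() change above batches the pool-lock cycling: because consecutive pwqs usually belong to the same pool, the lock is only released and re-acquired when the pool pointer changes. A userspace sketch of that batching pattern (not part of the patch; pthread mutexes and the sample array are stand-ins):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct pool {
	pthread_mutex_t lock;
	const char *name;
};

static struct pool pool_a = { PTHREAD_MUTEX_INITIALIZER, "A" };
static struct pool pool_b = { PTHREAD_MUTEX_INITIALIZER, "B" };

int main(void)
{
	/* Consecutive "pwqs" mostly share a pool, as in an unbound workqueue. */
	struct pool *pwqs[] = { &pool_a, &pool_a, &pool_a, &pool_b, &pool_b };
	struct pool *current_pool = NULL;

	for (size_t i = 0; i < sizeof(pwqs) / sizeof(pwqs[0]); i++) {
		if (current_pool != pwqs[i]) {
			if (current_pool)
				pthread_mutex_unlock(&current_pool->lock);
			current_pool = pwqs[i];
			pthread_mutex_lock(&current_pool->lock);
			printf("locked pool %s\n", current_pool->name);
		}
		printf("  flushed pwq %zu under pool %s\n", i, current_pool->name);
	}
	if (current_pool)
		pthread_mutex_unlock(&current_pool->lock);
	return 0;
}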