Diffstat
-rw-r--r--  kernel/bpf/Makefile             |   9
-rw-r--r--  kernel/bpf/arena.c              |  38
-rw-r--r--  kernel/bpf/arraymap.c           |  26
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c   |   4
-rw-r--r--  kernel/bpf/bpf_inode_storage.c  |   5
-rw-r--r--  kernel/bpf/bpf_local_storage.c  |  79
-rw-r--r--  kernel/bpf/bpf_lsm.c            |   2
-rw-r--r--  kernel/bpf/bpf_struct_ops.c     | 115
-rw-r--r--  kernel/bpf/bpf_task_storage.c   |   8
-rw-r--r--  kernel/bpf/btf.c                | 196
-rw-r--r--  kernel/bpf/cgroup.c             |   2
-rw-r--r--  kernel/bpf/core.c               |  16
-rw-r--r--  kernel/bpf/devmap.c             |   6
-rw-r--r--  kernel/bpf/dispatcher.c         |   3
-rw-r--r--  kernel/bpf/hashtab.c            |  56
-rw-r--r--  kernel/bpf/helpers.c            |  29
-rw-r--r--  kernel/bpf/kmem_cache_iter.c    | 238
-rw-r--r--  kernel/bpf/lpm_trie.c           | 133
-rw-r--r--  kernel/bpf/memalloc.c           |   5
-rw-r--r--  kernel/bpf/range_tree.c         | 272
-rw-r--r--  kernel/bpf/range_tree.h         |  21
-rw-r--r--  kernel/bpf/syscall.c            | 191
-rw-r--r--  kernel/bpf/task_iter.c          |   6
-rw-r--r--  kernel/bpf/token.c              |   1
-rw-r--r--  kernel/bpf/trampoline.c         |  60
-rw-r--r--  kernel/bpf/verifier.c           | 626
26 files changed, 1765 insertions, 382 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9b9c151b5c82..410028633621 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) -obj-$(CONFIG_BPF_SYSCALL) += arena.o +obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) @@ -52,3 +52,10 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o +obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o + +CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index e52b3ad231b9..945a5680f6a5 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -3,9 +3,11 @@ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> +#include "linux/filter.h" #include <linux/btf_ids.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> +#include "range_tree.h" /* * bpf_arena is a sparsely populated shared memory region between bpf program and @@ -45,7 +47,7 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; - struct maple_tree mt; + struct range_tree rt; struct list_head vma_list; struct mutex lock; }; @@ -98,6 +100,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) u64 vm_range; int err = -ENOMEM; + if (!bpf_jit_supports_arena()) + return ERR_PTR(-EOPNOTSUPP); + if (attr->key_size || attr->value_size || attr->max_entries == 0 || /* BPF_F_MMAPABLE must be set */ !(attr->map_flags & BPF_F_MMAPABLE) || @@ -132,7 +137,8 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) INIT_LIST_HEAD(&arena->vma_list); bpf_map_init_from_attr(&arena->map, attr); - mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + range_tree_init(&arena->rt); + range_tree_set(&arena->rt, 0, attr->max_entries); mutex_init(&arena->lock); return &arena->map; @@ -183,7 +189,7 @@ static void arena_map_free(struct bpf_map *map) apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); free_vm_area(arena->kern_vm); - mtree_destroy(&arena->mt); + range_tree_destroy(&arena->rt); bpf_map_area_free(arena); } @@ -274,20 +280,20 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) /* User space requested to segfault when page is not allocated by bpf prog */ return VM_FAULT_SIGSEGV; - ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); if (ret) { - mtree_erase(&arena->mt, vmf->pgoff); + range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; } ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); if (ret) { - mtree_erase(&arena->mt, vmf->pgoff); + range_tree_set(&arena->rt, vmf->pgoff, 1); __free_page(page); return VM_FAULT_SIGSEGV; } @@ -444,12 +450,16 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt guard(mutex)(&arena->lock); - if (uaddr) - ret = 
mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, - MT_ENTRY, GFP_KERNEL); - else - ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, - page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (uaddr) { + ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); + if (ret) + goto out_free_pages; + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + } else { + ret = pgoff = range_tree_find(&arena->rt, page_cnt); + if (pgoff >= 0) + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + } if (ret) goto out_free_pages; @@ -476,7 +486,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt kvfree(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: - mtree_erase(&arena->mt, pgoff); + range_tree_set(&arena->rt, pgoff, page_cnt); out_free_pages: kvfree(pages); return 0; @@ -516,7 +526,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) pgoff = compute_pgoff(arena, uaddr); /* clear range */ - mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + range_tree_set(&arena->rt, pgoff, page_cnt); if (page_cnt > 1) /* bulk zap if multiple pages being freed */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 79660e3fca4c..6cdbb4c33d31 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -947,22 +947,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); + bool is_extended; if (IS_ERR(prog)) return prog; - if (!bpf_prog_map_compatible(map, prog)) { + if (prog->type == BPF_PROG_TYPE_EXT || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + mutex_lock(&prog->aux->ext_mutex); + is_extended = prog->aux->is_extended; + if (!is_extended) + prog->aux->prog_array_member_cnt++; + mutex_unlock(&prog->aux->ext_mutex); + if (is_extended) { + /* Extended prog can not be tail callee. It's to prevent a + * potential infinite loop like: + * tail callee prog entry -> tail callee prog subprog -> + * freplace prog entry --tailcall-> tail callee prog entry. 
+ */ + bpf_prog_put(prog); + return ERR_PTR(-EBUSY); + } + return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + struct bpf_prog *prog = ptr; + + mutex_lock(&prog->aux->ext_mutex); + prog->aux->prog_array_member_cnt--; + mutex_unlock(&prog->aux->ext_mutex); /* bpf_prog is freed after one RCU or tasks trace grace period */ - bpf_prog_put(ptr); + bpf_prog_put(prog); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 28efd0a3f220..20f05de92e9c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); @@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, BPF_NOEXIST, gfp_flags); + value, BPF_NOEXIST, false, gfp_flags); unlock: bpf_cgrp_storage_unlock(); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..a51c82dee1bd 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); @@ -100,7 +99,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); return PTR_ERR_OR_ZERO(sdata); } @@ -154,7 +153,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index c938dea5ddbf..7e6a0af0afc1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, gfp_t gfp_flags) + void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, } if (selem) { - if (value) + if (value) { + /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); - /* No need to call check_and_init_map_value as memory is zero init */ + if (swap_uptrs) + bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); + } return selem; } @@ -209,8 +212,12 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* The bpf_local_storage_map_free will wait for rcu_barrier */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); bpf_mem_cache_raw_free(selem); } @@ -226,16 +233,25 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - if (!smap->bpf_ma) { + /* Only task storage has uptrs and task storage + * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true + * for task storage, so this bpf_obj_free_fields() won't unpin + * any uptr. + */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - } else { + if (reuse_now) { + /* reuse_now == true only happens when the storage owner + * (e.g. task_struct) is being destructed or the map itself + * is being destructed (ie map_free). In both cases, + * no bpf prog can have a hold on the selem. It is + * safe to unpin the uptrs and free the selem now. + */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. @@ -243,6 +259,26 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); + return; + } + + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); +} + +static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + struct hlist_node *n; + + /* The "_safe" iteration is needed. + * The loop is not removing the selem from the list + * but bpf_selem_free will use the selem->rcu_head + * which is union-ized with the selem->free_node. 
+ */ + hlist_for_each_entry_safe(selem, n, list, free_node) { + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + bpf_selem_free(selem, smap, reuse_now); } } @@ -252,7 +288,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool reuse_now) + bool uncharge_mem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -296,7 +332,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - bpf_selem_free(selem, smap, reuse_now); + hlist_add_head(&selem->free_node, free_selem_list); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); @@ -345,6 +381,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; + HLIST_HEAD(selem_free_list); unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) @@ -360,9 +397,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, reuse_now); + local_storage, selem, true, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&selem_free_list, reuse_now); + if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } @@ -524,11 +563,12 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags) + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + HLIST_HEAD(old_selem_free_list); unsigned long flags; int err; @@ -550,7 +590,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); @@ -584,7 +624,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. 
*/ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -624,11 +664,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, false); + true, &old_selem_free_list); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); @@ -706,6 +747,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; + HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; @@ -734,10 +776,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, true); + local_storage, selem, true, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&free_selem_list, true); + if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } @@ -883,6 +927,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, synchronize_rcu(); if (smap->bpf_ma) { + rcu_barrier_tasks_trace(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); bpf_mem_alloc_destroy(&smap->selem_ma); bpf_mem_alloc_destroy(&smap->storage_ma); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 3bc61628ab25..967492b65185 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -375,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_current_getsecid_subj) -BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index fda3dd2ee984..606efe32485a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,6 @@ struct bpf_struct_ops_value { struct bpf_struct_ops_map { struct bpf_map map; - struct rcu_head rcu; const struct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; @@ -32,7 +31,9 @@ struct bpf_struct_ops_map { * (in kvalue.data). */ struct bpf_link **links; - u32 links_cnt; + /* ksyms for bpf trampolines */ + struct bpf_ksym **ksyms; + u32 funcs_cnt; u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. 
@@ -481,11 +482,11 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) { u32 i; - for (i = 0; i < st_map->links_cnt; i++) { - if (st_map->links[i]) { - bpf_link_put(st_map->links[i]); - st_map->links[i] = NULL; - } + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } @@ -586,6 +587,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, return 0; } +static void bpf_struct_ops_ksym_init(const char *tname, const char *mname, + void *image, unsigned int size, + struct bpf_ksym *ksym) +{ + snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname); + INIT_LIST_HEAD_RCU(&ksym->lnode); + bpf_image_ksym_init(image, size, ksym); +} + +static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + bpf_image_ksym_add(st_map->ksyms[i]); + } +} + +static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + bpf_image_ksym_del(st_map->ksyms[i]); + } +} + +static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + kfree(st_map->ksyms[i]); + st_map->ksyms[i] = NULL; + } +} + static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { @@ -601,6 +645,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, int prog_fd, err; u32 i, trampoline_start, image_off = 0; void *cur_image = NULL, *image = NULL; + struct bpf_link **plink; + struct bpf_ksym **pksym; + const char *tname, *mname; if (flags) return -EINVAL; @@ -639,14 +686,19 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, udata = &uvalue->data; kdata = &kvalue->data; + plink = st_map->links; + pksym = st_map->ksyms; + tname = btf_name_by_offset(st_map->btf, t->name_off); module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; struct bpf_tramp_link *link; + struct bpf_ksym *ksym; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; + mname = btf_name_by_offset(st_map->btf, member->name_off); ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) @@ -714,7 +766,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); - st_map->links[i] = &link->link; + *plink++ = &link->link; + + ksym = kzalloc(sizeof(*ksym), GFP_USER); + if (!ksym) { + err = -ENOMEM; + goto reset_unlock; + } + *pksym++ = ksym; trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, @@ -735,6 +794,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; + + /* init ksym for this trampoline */ + bpf_struct_ops_ksym_init(tname, mname, + image + trampoline_start, + image_off - trampoline_start, + ksym); } if (st_ops->validate) { @@ -783,6 +848,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, */ reset_unlock: + bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); 
bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); @@ -790,6 +856,8 @@ reset_unlock: unlock: kfree(tlinks); mutex_unlock(&st_map->lock); + if (!err) + bpf_struct_ops_map_add_ksyms(st_map); return err; } @@ -849,7 +917,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); + if (st_map->ksyms) + bpf_struct_ops_map_free_ksyms(st_map); bpf_map_area_free(st_map->links); + bpf_map_area_free(st_map->ksyms); bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -866,6 +937,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_del_ksyms(st_map); + /* The struct_ops's function may switch to another struct_ops. * * For example, bpf_tcp_cc_x->init() may switch to @@ -895,6 +968,19 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) return 0; } +static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t) +{ + int i; + u32 count; + const struct btf_member *member; + + count = 0; + for_each_member(i, t, member) + if (btf_type_resolve_func_ptr(btf, member->type, NULL)) + count++; + return count; +} + static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops_desc *st_ops_desc; @@ -961,11 +1047,15 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->links_cnt = btf_type_vlen(t); + st_map->funcs_cnt = count_func_ptrs(btf, t); st_map->links = - bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *), + bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *), + NUMA_NO_NODE); + + st_map->ksyms = + bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *), NUMA_NO_NODE); - if (!st_map->uvalue || !st_map->links) { + if (!st_map->uvalue || !st_map->links || !st_map->ksyms) { ret = -ENOMEM; goto errout_free; } @@ -994,7 +1084,8 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) usage = sizeof(*st_map) + vt->size - sizeof(struct bpf_struct_ops_value); usage += vt->size; - usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); + usage += st_map->funcs_cnt * sizeof(struct bpf_link *); + usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *); usage += PAGE_SIZE; return usage; } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..bf7fa15fdcc6 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); @@ -129,6 +128,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, struct pid *pid; int fd, err; + if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR)) + return -EOPNOTSUPP; + fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) @@ -147,7 +149,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - GFP_ATOMIC); + true, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -219,7 +221,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map, (flags & 
BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? NULL : sdata->data; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5cd1c7a23848..e5a5f023cedd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2808,7 +2808,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "type_id=%u", t->type); } -static struct btf_kind_operations modifier_ops = { +static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, @@ -2817,7 +2817,7 @@ static struct btf_kind_operations modifier_ops = { .show = btf_modifier_show, }; -static struct btf_kind_operations ptr_ops = { +static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, @@ -2858,7 +2858,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct"); } -static struct btf_kind_operations fwd_ops = { +static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, @@ -3109,7 +3109,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t, __btf_array_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations array_ops = { +static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, @@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; u32 res_id; @@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_REF; else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_PERCPU; + else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_UPTR; else return -EINVAL; + if (!(type & field_mask)) + return BTF_FIELD_IGNORE; + /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ @@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ - if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) { + if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } @@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; @@ -3667,8 +3673,9 @@ static int btf_find_field_one(const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, - info_cnt ? &info[0] : &tmp); + info_cnt ? 
&info[0] : &tmp, field_mask); if (ret < 0) return ret; break; @@ -3991,6 +3998,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; @@ -4050,12 +4058,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. */ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; + const struct btf_type *t; u32 btf_id; + if (rec->fields[i].type == BPF_UPTR) { + /* The uptr only supports pinning one page and cannot + * point to a kernel struct + */ + if (btf_is_kernel(rec->fields[i].kptr.btf)) + return -EINVAL; + t = btf_type_by_id(rec->fields[i].kptr.btf, + rec->fields[i].kptr.btf_id); + if (!t->size) + return -EINVAL; + if (t->size > PAGE_SIZE) + return -E2BIG; + continue; + } + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; @@ -4191,7 +4215,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t, __btf_struct_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations struct_ops = { +static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, @@ -4359,7 +4383,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum_ops = { +static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4462,7 +4486,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum64_ops = { +static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4540,7 +4564,7 @@ done: btf_verifier_log(env, ")"); } -static struct btf_kind_operations func_proto_ops = { +static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* @@ -4598,7 +4622,7 @@ static int btf_func_resolve(struct btf_verifier_env *env, return 0; } -static struct btf_kind_operations func_ops = { +static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, @@ -5566,7 +5590,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) goto free_aof; } - ret = btf_find_kptr(btf, t, 0, 0, &tmp); + ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; @@ -6415,6 +6439,101 @@ int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, return off; } +struct bpf_raw_tp_null_args { + const char *func; + u64 mask; +}; + +static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { + /* sched */ + { "sched_pi_setprio", 0x10 }, + /* ... 
from sched_numa_pair_template event class */ + { "sched_stick_numa", 0x100 }, + { "sched_swap_numa", 0x100 }, + /* afs */ + { "afs_make_fs_call", 0x10 }, + { "afs_make_fs_calli", 0x10 }, + { "afs_make_fs_call1", 0x10 }, + { "afs_make_fs_call2", 0x10 }, + { "afs_protocol_error", 0x1 }, + { "afs_flock_ev", 0x10 }, + /* cachefiles */ + { "cachefiles_lookup", 0x1 | 0x200 }, + { "cachefiles_unlink", 0x1 }, + { "cachefiles_rename", 0x1 }, + { "cachefiles_prep_read", 0x1 }, + { "cachefiles_mark_active", 0x1 }, + { "cachefiles_mark_failed", 0x1 }, + { "cachefiles_mark_inactive", 0x1 }, + { "cachefiles_vfs_error", 0x1 }, + { "cachefiles_io_error", 0x1 }, + { "cachefiles_ondemand_open", 0x1 }, + { "cachefiles_ondemand_copen", 0x1 }, + { "cachefiles_ondemand_close", 0x1 }, + { "cachefiles_ondemand_read", 0x1 }, + { "cachefiles_ondemand_cread", 0x1 }, + { "cachefiles_ondemand_fd_write", 0x1 }, + { "cachefiles_ondemand_fd_release", 0x1 }, + /* ext4, from ext4__mballoc event class */ + { "ext4_mballoc_discard", 0x10 }, + { "ext4_mballoc_free", 0x10 }, + /* fib */ + { "fib_table_lookup", 0x100 }, + /* filelock */ + /* ... from filelock_lock event class */ + { "posix_lock_inode", 0x10 }, + { "fcntl_setlk", 0x10 }, + { "locks_remove_posix", 0x10 }, + { "flock_lock_inode", 0x10 }, + /* ... from filelock_lease event class */ + { "break_lease_noblock", 0x10 }, + { "break_lease_block", 0x10 }, + { "break_lease_unblock", 0x10 }, + { "generic_delete_lease", 0x10 }, + { "time_out_leases", 0x10 }, + /* host1x */ + { "host1x_cdma_push_gather", 0x10000 }, + /* huge_memory */ + { "mm_khugepaged_scan_pmd", 0x10 }, + { "mm_collapse_huge_page_isolate", 0x1 }, + { "mm_khugepaged_scan_file", 0x10 }, + { "mm_khugepaged_collapse_file", 0x10 }, + /* kmem */ + { "mm_page_alloc", 0x1 }, + { "mm_page_pcpu_drain", 0x1 }, + /* .. from mm_page event class */ + { "mm_page_alloc_zone_locked", 0x1 }, + /* netfs */ + { "netfs_failure", 0x10 }, + /* power */ + { "device_pm_callback_start", 0x10 }, + /* qdisc */ + { "qdisc_dequeue", 0x1000 }, + /* rxrpc */ + { "rxrpc_recvdata", 0x1 }, + { "rxrpc_resend", 0x10 }, + /* sunrpc */ + { "xs_stream_read_data", 0x1 }, + /* ... 
from xprt_cong_event event class */ + { "xprt_reserve_cong", 0x10 }, + { "xprt_release_cong", 0x10 }, + { "xprt_get_cong", 0x10 }, + { "xprt_put_cong", 0x10 }, + /* tcp */ + { "tcp_send_reset", 0x11 }, + /* tegra_apb_dma */ + { "tegra_dma_tx_status", 0x100 }, + /* timer_migration */ + { "tmigr_update_events", 0x1 }, + /* writeback, from writeback_folio_template event class */ + { "writeback_dirty_folio", 0x10 }, + { "folio_wait_writeback", 0x10 }, + /* rdma */ + { "mr_integ_alloc", 0x2000 }, + /* bpf_testmod */ + { "bpf_testmod_test_read", 0x0 }, +}; + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6425,6 +6544,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; @@ -6519,6 +6639,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } + if (size != sizeof(u64)) { + bpf_log(log, "func '%s' size %d must be 8\n", + tname, size); + return false; + } + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; @@ -6567,6 +6693,39 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + struct btf *btf = prog->aux->attach_btf; + const struct btf_type *t; + const char *tname; + + /* BTF lookups cannot fail, return false on error */ + t = btf_type_by_id(btf, prog->aux->attach_btf_id); + if (!t) + return false; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return false; + /* Checked by bpf_check_attach_target */ + tname += sizeof("btf_trace_") - 1; + for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { + /* Is this a func with potential NULL args? */ + if (strcmp(tname, raw_tp_null_args[i].func)) + continue; + if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + info->reg_type |= PTR_MAYBE_NULL; + /* Is the current arg IS_ERR? */ + if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + ptr_err_raw_tp = true; + break; + } + /* If we don't know NULL-ness specification and the tracepoint + * is coming from a loadable module, be conservative and mark + * argument as PTR_MAYBE_NULL. + */ + if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) + info->reg_type |= PTR_MAYBE_NULL; + } + if (tgt_prog) { enum bpf_prog_type tgt_type; @@ -6611,6 +6770,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); + + /* Perform all checks on the validity of type for this argument, but if + * we know it can be IS_ERR at runtime, scrub pointer type and mark as + * scalar. 
+ */ + if (ptr_err_raw_tp) { + bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); + info->reg_type = SCALAR_VALUE; + } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 025d7e2214ae..46e5db65dbc8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1708,7 +1708,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * returned value != 1 during execution. In all other cases 0 is returned. */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, - struct ctl_table *table, int write, + const struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5e77c58e0601..da729cbbaeb9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,7 +21,7 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> @@ -131,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); + mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); return fp; @@ -538,6 +539,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { + int err; + /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ @@ -545,7 +548,9 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; - return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); + err = bpf_adj_branches(prog, off, off + cnt, off, false); + WARN_ON_ONCE(err); + return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) @@ -2935,7 +2940,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog) { } -bool __weak bpf_helper_changes_pkt_data(void *func) +bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } @@ -3044,6 +3049,11 @@ bool __weak bpf_jit_supports_exceptions(void) return false; } +bool __weak bpf_jit_supports_private_stack(void) +{ + return false; +} + void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7878be18e9d2..3aa002a47a96 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -184,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -821,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; @@ -838,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; 
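A note on reading the raw_tp_null_args table added to btf.c above: each tracepoint argument owns one nibble of the mask, matching the mask & (0x1 << (arg * 4)) and mask & (0x2 << (arg * 4)) checks in btf_ctx_access(). Bit 0x1 of a nibble marks that argument as possibly NULL; bit 0x2 marks it as possibly an ERR_PTR-encoded error. The following standalone sketch is not part of the patch (the decode helper name is made up for illustration); it only shows the decoding:

/*
 * Decode the per-argument nibble encoding used by raw_tp_null_args.
 * Bit 0x1 of argument N's nibble => the argument may be NULL.
 * Bit 0x2 of argument N's nibble => the argument may be IS_ERR().
 */
#include <stdio.h>

static void decode_raw_tp_mask(const char *func, unsigned long long mask)
{
	int arg;

	for (arg = 0; arg < 16 && (mask >> (arg * 4)); arg++) {
		unsigned int nibble = (mask >> (arg * 4)) & 0xf;

		if (nibble & 0x1)
			printf("%s: arg%d may be NULL\n", func, arg);
		if (nibble & 0x2)
			printf("%s: arg%d may be IS_ERR\n", func, arg);
	}
}

int main(void)
{
	decode_raw_tp_mask("tcp_send_reset", 0x11);   /* arg0 and arg1 may be NULL */
	decode_raw_tp_mask("mr_integ_alloc", 0x2000); /* arg3 may be an error pointer */
	return 0;
}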
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 70fb82bf1637..b77db7413f8c 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = NULL; goto out; } - bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); + bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym); + bpf_image_ksym_add(&d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b14b87463ee0..3ec941a0ea41 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -896,9 +896,12 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { check_and_free_fields(htab, l); + + migrate_disable(); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); bpf_mem_cache_free(&htab->ma, l); + migrate_enable(); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); - __pcpu_freelist_push(&htab->freelist, &l->fnode); + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); htab_elem_free(htab, l); @@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; - htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; @@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; + void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); + + /* l_old has already been stashed in htab->extra_elems, free + * its special fields before it is available for reuse. Also + * save the old map pointer in htab of maps before unlock + * and release it after unlock. + */ + old_map_ptr = NULL; + if (htab_is_prealloc(htab)) { + if (map->ops->map_fd_put_ptr) + old_map_ptr = fd_htab_map_get_ptr(map, l_old); + check_and_free_fields(htab, l_old); + } + } + htab_unlock_bucket(htab, b, hash, flags); + if (l_old) { + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); - else - check_and_free_fields(htab, l_old); } - ret = 0; + return 0; err: htab_unlock_bucket(htab, b, hash, flags); return ret; @@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) return ret; l = lookup_elem_raw(head, hash, key, key_size); - - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - free_htab_elem(htab, l); - } else { + else ret = -ENOENT; - } htab_unlock_bucket(htab, b, hash, flags); + + if (l) + free_htab_elem(htab, l); return ret; } @@ -1853,13 +1871,14 @@ again_nocopy: * may cause deadlock. See comments in function * prealloc_lru_pop(). Let us do bpf_lru_push_free() * after releasing the bucket lock. + * + * For htab of maps, htab_put_fd_value() in + * free_htab_elem() may acquire a spinlock with bucket + * lock being held and it violates the lock rule, so + * invoke free_htab_elem() after unlock as well. 
*/ - if (is_lru_map) { - l->batch_flink = node_to_free; - node_to_free = l; - } else { - free_htab_elem(htab, l); - } + l->batch_flink = node_to_free; + node_to_free = l; } dst_key += key_size; dst_val += value_size; @@ -1871,7 +1890,10 @@ again_nocopy: while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; - htab_lru_push_free(htab, l); + if (is_lru_map) + htab_lru_push_free(htab, l); + else + free_htab_elem(htab, l); } next_batch: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3d45ebe8afb4..751c150f9e1c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2522,6 +2522,25 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) } /** + * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up + * in the pid namespace of the current task. If a task is returned, it must + * either be stored in a map, or released with bpf_task_release(). + * @vpid: The vpid of the task being looked up. + */ +__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_vpid(vpid); + if (p) + p = bpf_task_acquire(p); + rcu_read_unlock(); + + return p; +} + +/** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr @@ -3068,7 +3087,9 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) +BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -3086,8 +3107,8 @@ BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) -BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) -BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) +BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) @@ -3124,6 +3145,10 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_kmem_cache) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c new file mode 100644 index 000000000000..3ae2158d767f --- /dev/null +++ b/kernel/bpf/kmem_cache_iter.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */ + +/* open-coded version */ +struct bpf_iter_kmem_cache { + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_kmem_cache_kern { + struct kmem_cache *pos; +} __attribute__((aligned(8))); + 
+#define KMEM_CACHE_POS_START ((void *)1L) + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->pos = KMEM_CACHE_POS_START; + return 0; +} + +__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + struct kmem_cache *prev = kit->pos; + struct kmem_cache *next; + bool destroy = false; + + if (!prev) + return NULL; + + mutex_lock(&slab_mutex); + + if (list_empty(&slab_caches)) { + mutex_unlock(&slab_mutex); + return NULL; + } + + if (prev == KMEM_CACHE_POS_START) + next = list_first_entry(&slab_caches, struct kmem_cache, list); + else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev) + next = NULL; + else + next = list_next_entry(prev, list); + + /* boot_caches have negative refcount, don't touch them */ + if (next && next->refcount > 0) + next->refcount++; + + /* Skip kmem_cache_destroy() for active entries */ + if (prev && prev != KMEM_CACHE_POS_START) { + if (prev->refcount > 1) + prev->refcount--; + else if (prev->refcount == 1) + destroy = true; + } + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(prev); + + kit->pos = next; + return next; +} + +__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + struct kmem_cache *s = kit->pos; + bool destroy = false; + + if (s == NULL || s == KMEM_CACHE_POS_START) + return; + + mutex_lock(&slab_mutex); + + /* Skip kmem_cache_destroy() for active entries */ + if (s->refcount > 1) + s->refcount--; + else if (s->refcount == 1) + destroy = true; + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(s); +} + +__bpf_kfunc_end_defs(); + +struct bpf_iter__kmem_cache { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct kmem_cache *, s); +}; + +union kmem_cache_iter_priv { + struct bpf_iter_kmem_cache it; + struct bpf_iter_kmem_cache_kern kit; +}; + +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t cnt = 0; + bool found = false; + struct kmem_cache *s; + union kmem_cache_iter_priv *p = seq->private; + + mutex_lock(&slab_mutex); + + /* Find an entry at the given position in the slab_caches list instead + * of keeping a reference (of the last visited entry, if any) out of + * slab_mutex. It might miss something if one is deleted in the middle + * while it releases the lock. But it should be rare and there's not + * much we can do about it. + */ + list_for_each_entry(s, &slab_caches, list) { + if (cnt == *pos) { + /* Make sure this entry remains in the list by getting + * a new reference count. Note that boot_cache entries + * have a negative refcount, so don't touch them. 
+ */ + if (s->refcount > 0) + s->refcount++; + found = true; + break; + } + cnt++; + } + mutex_unlock(&slab_mutex); + + if (!found) + s = NULL; + + p->kit.pos = s; + return s; +} + +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__kmem_cache ctx = { + .meta = &meta, + .s = v, + }; + union kmem_cache_iter_priv *p = seq->private; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog && !ctx.s) + bpf_iter_run_prog(prog, &ctx); + + bpf_iter_kmem_cache_destroy(&p->it); +} + +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + union kmem_cache_iter_priv *p = seq->private; + + ++*pos; + + return bpf_iter_kmem_cache_next(&p->it); +} + +static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__kmem_cache ctx = { + .meta = &meta, + .s = v, + }; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static const struct seq_operations kmem_cache_iter_seq_ops = { + .start = kmem_cache_iter_seq_start, + .next = kmem_cache_iter_seq_next, + .stop = kmem_cache_iter_seq_stop, + .show = kmem_cache_iter_seq_show, +}; + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache) + +static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = { + .seq_ops = &kmem_cache_iter_seq_ops, + .seq_priv_size = sizeof(union kmem_cache_iter_priv), +}; + +static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "kmem_cache iter\n"); +} + +DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta, + struct kmem_cache *s) + +static struct bpf_iter_reg bpf_kmem_cache_reg_info = { + .target = "kmem_cache", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__kmem_cache, s), + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, + }, + .seq_info = &kmem_cache_iter_seq_info, +}; + +static int __init bpf_kmem_cache_iter_init(void) +{ + bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0]; + return bpf_iter_reg_target(&bpf_kmem_cache_reg_info); +} + +late_initcall(bpf_kmem_cache_iter_init); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9b60eda0f727..f8bc1e096182 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_mem_alloc.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -22,7 +23,6 @@ struct lpm_trie_node; struct lpm_trie_node { - struct rcu_head rcu; struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; @@ -32,10 +32,11 @@ struct lpm_trie_node { struct lpm_trie { struct bpf_map map; struct lpm_trie_node __rcu *root; + struct bpf_mem_alloc ma; size_t n_entries; size_t max_prefixlen; size_t data_size; - spinlock_t lock; + raw_spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) return found->data + trie->data_size; } -static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, - const void *value) +static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie, + const void *value, + bool 
disable_migration) { struct lpm_trie_node *node; - size_t size = sizeof(struct lpm_trie_node) + trie->data_size; - if (value) - size += trie->map.value_size; + if (disable_migration) + migrate_disable(); + node = bpf_mem_cache_alloc(&trie->ma); + if (disable_migration) + migrate_enable(); - node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN, - trie->map.numa_node); if (!node) return NULL; @@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, return node; } +static int trie_check_add_elem(struct lpm_trie *trie, u64 flags) +{ + if (flags == BPF_EXIST) + return -ENOENT; + if (trie->n_entries == trie->map.max_entries) + return -ENOSPC; + trie->n_entries++; + return 0; +} + /* Called from syscall or from eBPF program */ static long trie_update_elem(struct bpf_map *map, void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *node, *im_node, *new_node; struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; @@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - spin_lock_irqsave(&trie->lock, irq_flags); - - /* Allocate and fill a new node */ - - if (trie->n_entries == trie->map.max_entries) { - ret = -ENOSPC; - goto out; - } - - new_node = lpm_trie_node_alloc(trie, value); - if (!new_node) { - ret = -ENOMEM; - goto out; - } + /* Allocate and fill a new node. Need to disable migration before + * invoking bpf_mem_cache_alloc(). + */ + new_node = lpm_trie_node_alloc(trie, value, true); + if (!new_node) + return -ENOMEM; - trie->n_entries++; + raw_spin_lock_irqsave(&trie->lock, irq_flags); new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map, matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || - node->prefixlen == key->prefixlen || - node->prefixlen == trie->max_prefixlen) + node->prefixlen == key->prefixlen) break; next_bit = extract_bit(key->data, node->prefixlen); @@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map, * simply assign the @new_node to that slot and be done. */ if (!node) { + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + rcu_assign_pointer(*slot, new_node); goto out; } @@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map, * which already has the correct data array set. */ if (node->prefixlen == matchlen) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) { + if (flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out; + } + } else { + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + } + new_node->child[0] = node->child[0]; new_node->child[1] = node->child[1]; - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) - trie->n_entries--; - rcu_assign_pointer(*slot, new_node); free_node = node; goto out; } + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + /* If the new node matches the prefix completely, it must be inserted * as an ancestor. Simply insert it between @node and *@slot. 
*/ @@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map, goto out; } - im_node = lpm_trie_node_alloc(trie, NULL); + /* migration is disabled within the locked scope */ + im_node = lpm_trie_node_alloc(trie, NULL, false); if (!im_node) { + trie->n_entries--; ret = -ENOMEM; goto out; } @@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - if (ret) { - if (new_node) - trie->n_entries--; + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree(new_node); - kfree(im_node); - } - - spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree_rcu(free_node, rcu); + migrate_disable(); + if (ret) + bpf_mem_cache_free(&trie->ma, new_node); + bpf_mem_cache_free_rcu(&trie->ma, free_node); + migrate_enable(); return ret; } @@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - spin_lock_irqsave(&trie->lock, irq_flags); + raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree_rcu(free_parent, rcu); - kfree_rcu(free_node, rcu); + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + migrate_disable(); + bpf_mem_cache_free_rcu(&trie->ma, free_parent); + bpf_mem_cache_free_rcu(&trie->ma, free_node); + migrate_enable(); return ret; } @@ -559,6 +580,8 @@ out: static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; + size_t leaf_size; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || @@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - spin_lock_init(&trie->lock); + raw_spin_lock_init(&trie->lock); + /* Allocate intermediate and leaf nodes from the same allocator */ + leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + + trie->map.value_size; + err = bpf_mem_alloc_init(&trie->ma, leaf_size, false); + if (err) + goto free_out; return &trie->map; + +free_out: + bpf_map_area_free(trie); + return ERR_PTR(err); } static void trie_free(struct bpf_map *map) @@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map) continue; } - kfree(node); + /* No bpf program may access the map, so freeing the + * node without waiting for the extra RCU GP. + */ + bpf_mem_cache_raw_free(node); RCU_INIT_POINTER(*slot, NULL); break; } } out: + bpf_mem_alloc_destroy(&trie->ma); bpf_map_area_free(trie); } @@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; - size_t matchlen; + size_t matchlen = 0; /* The get_next_key follows postorder. 
For the 4 node example in * the top of this file, the trie_get_next_key() returns the following @@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference(node->child[next_bit]); } - if (!node || node->prefixlen != key->prefixlen || + if (!node || node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 146f5b57cfb1..889374722d0a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -254,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) static void free_one(void *obj, bool percpu) { - if (percpu) { + if (percpu) free_percpu(((void __percpu **)obj)[1]); - kfree(obj); - return; - } kfree(obj); } diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c new file mode 100644 index 000000000000..5bdf9aadca3a --- /dev/null +++ b/kernel/bpf/range_tree.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <linux/interval_tree_generic.h> +#include <linux/slab.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/bpf.h> +#include "range_tree.h" + +/* + * struct range_tree is a data structure used to allocate contiguous memory + * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is + * represented by struct range_node or 'rn' for short. + * rn->rn_rbnode links it into an interval tree while + * rn->rb_range_size links it into a second rbtree sorted by size of the range. + * __find_range() performs binary search and best fit algorithm to find the + * range less or equal requested size. + * range_tree_clear/set() clears or sets a range of bits in this bitmap. The + * adjacent ranges are merged or split at the same time. + * + * The split/merge logic is based/borrowed from XFS's xbitmap32 added + * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). + * + * The implementation relies on external lock to protect rbtree-s. + * The alloc/free of range_node-s is done via bpf_mem_alloc. + * + * bpf arena is using range_tree to represent unallocated slots. + * At init time: + * range_tree_set(rt, 0, max); + * Then: + * start = range_tree_find(rt, len); + * if (start >= 0) + * range_tree_clear(rt, start, len); + * to find free range and mark slots as allocated and later: + * range_tree_set(rt, start, len); + * to mark as unallocated after use. 
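 * is_range_tree_set(rt, start, len) returns 0 only when the whole
 * [start, start + len) range is still marked as set (i.e. free), so a
 * caller that wants to claim a specific range only if it is entirely
 * free can combine the helpers (illustrative pattern, not code taken
 * from arena.c):
 *   if (is_range_tree_set(rt, start, len))
 *           return -EINVAL;
 *   range_tree_clear(rt, start, len);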
+ */ +struct range_node { + struct rb_node rn_rbnode; + struct rb_node rb_range_size; + u32 rn_start; + u32 rn_last; /* inclusive */ + u32 __rn_subtree_last; +}; + +static struct range_node *rb_to_range_node(struct rb_node *rb) +{ + return rb_entry(rb, struct range_node, rb_range_size); +} + +static u32 rn_size(struct range_node *rn) +{ + return rn->rn_last - rn->rn_start + 1; +} + +/* Find range that fits best to requested size */ +static inline struct range_node *__find_range(struct range_tree *rt, u32 len) +{ + struct rb_node *rb = rt->range_size_root.rb_root.rb_node; + struct range_node *best = NULL; + + while (rb) { + struct range_node *rn = rb_to_range_node(rb); + + if (len <= rn_size(rn)) { + best = rn; + rb = rb->rb_right; + } else { + rb = rb->rb_left; + } + } + + return best; +} + +s64 range_tree_find(struct range_tree *rt, u32 len) +{ + struct range_node *rn; + + rn = __find_range(rt, len); + if (!rn) + return -ENOENT; + return rn->rn_start; +} + +/* Insert the range into rbtree sorted by the range size */ +static inline void __range_size_insert(struct range_node *rn, + struct rb_root_cached *root) +{ + struct rb_node **link = &root->rb_root.rb_node, *rb = NULL; + u64 size = rn_size(rn); + bool leftmost = true; + + while (*link) { + rb = *link; + if (size > rn_size(rb_to_range_node(rb))) { + link = &rb->rb_left; + } else { + link = &rb->rb_right; + leftmost = false; + } + } + + rb_link_node(&rn->rb_range_size, rb, link); + rb_insert_color_cached(&rn->rb_range_size, root, leftmost); +} + +#define START(node) ((node)->rn_start) +#define LAST(node) ((node)->rn_last) + +INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32, + __rn_subtree_last, START, LAST, + static inline __maybe_unused, + __range_it) + +static inline __maybe_unused void +range_it_insert(struct range_node *rn, struct range_tree *rt) +{ + __range_size_insert(rn, &rt->range_size_root); + __range_it_insert(rn, &rt->it_root); +} + +static inline __maybe_unused void +range_it_remove(struct range_node *rn, struct range_tree *rt) +{ + rb_erase_cached(&rn->rb_range_size, &rt->range_size_root); + RB_CLEAR_NODE(&rn->rb_range_size); + __range_it_remove(rn, &rt->it_root); +} + +static inline __maybe_unused struct range_node * +range_it_iter_first(struct range_tree *rt, u32 start, u32 last) +{ + return __range_it_iter_first(&rt->it_root, start, last); +} + +/* Clear the range in this range tree */ +int range_tree_clear(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *new_rn; + struct range_node *rn; + + while ((rn = range_it_iter_first(rt, start, last))) { + if (rn->rn_start < start && rn->rn_last > last) { + u32 old_last = rn->rn_last; + + /* Overlaps with the entire clearing range */ + range_it_remove(rn, rt); + rn->rn_last = start - 1; + range_it_insert(rn, rt); + + /* Add a range */ + migrate_disable(); + new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + migrate_enable(); + if (!new_rn) + return -ENOMEM; + new_rn->rn_start = last + 1; + new_rn->rn_last = old_last; + range_it_insert(new_rn, rt); + } else if (rn->rn_start < start) { + /* Overlaps with the left side of the clearing range */ + range_it_remove(rn, rt); + rn->rn_last = start - 1; + range_it_insert(rn, rt); + } else if (rn->rn_last > last) { + /* Overlaps with the right side of the clearing range */ + range_it_remove(rn, rt); + rn->rn_start = last + 1; + range_it_insert(rn, rt); + break; + } else { + /* in the middle of the clearing range */ + range_it_remove(rn, rt); + migrate_disable(); + 
bpf_mem_free(&bpf_global_ma, rn); + migrate_enable(); + } + } + return 0; +} + +/* Is the whole range set ? */ +int is_range_tree_set(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *left; + + /* Is this whole range set ? */ + left = range_it_iter_first(rt, start, last); + if (left && left->rn_start <= start && left->rn_last >= last) + return 0; + return -ESRCH; +} + +/* Set the range in this range tree */ +int range_tree_set(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *right; + struct range_node *left; + int err; + + /* Is this whole range already set ? */ + left = range_it_iter_first(rt, start, last); + if (left && left->rn_start <= start && left->rn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + err = range_tree_clear(rt, start, len); + if (err) + return err; + + /* Do we have a left-adjacent range ? */ + left = range_it_iter_first(rt, start - 1, start - 1); + if (left && left->rn_last + 1 != start) + return -EFAULT; + + /* Do we have a right-adjacent range ? */ + right = range_it_iter_first(rt, last + 1, last + 1); + if (right && right->rn_start != last + 1) + return -EFAULT; + + if (left && right) { + /* Combine left and right adjacent ranges */ + range_it_remove(left, rt); + range_it_remove(right, rt); + left->rn_last = right->rn_last; + range_it_insert(left, rt); + migrate_disable(); + bpf_mem_free(&bpf_global_ma, right); + migrate_enable(); + } else if (left) { + /* Combine with the left range */ + range_it_remove(left, rt); + left->rn_last = last; + range_it_insert(left, rt); + } else if (right) { + /* Combine with the right range */ + range_it_remove(right, rt); + right->rn_start = start; + range_it_insert(right, rt); + } else { + migrate_disable(); + left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + migrate_enable(); + if (!left) + return -ENOMEM; + left->rn_start = start; + left->rn_last = last; + range_it_insert(left, rt); + } + return 0; +} + +void range_tree_destroy(struct range_tree *rt) +{ + struct range_node *rn; + + while ((rn = range_it_iter_first(rt, 0, -1U))) { + range_it_remove(rn, rt); + migrate_disable(); + bpf_mem_free(&bpf_global_ma, rn); + migrate_enable(); + } +} + +void range_tree_init(struct range_tree *rt) +{ + rt->it_root = RB_ROOT_CACHED; + rt->range_size_root = RB_ROOT_CACHED; +} diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h new file mode 100644 index 000000000000..ff0b9110eb71 --- /dev/null +++ b/kernel/bpf/range_tree.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef _RANGE_TREE_H +#define _RANGE_TREE_H 1 + +struct range_tree { + /* root of interval tree */ + struct rb_root_cached it_root; + /* root of rbtree of interval sizes */ + struct rb_root_cached range_size_root; +}; + +void range_tree_init(struct range_tree *rt); +void range_tree_destroy(struct range_tree *rt); + +int range_tree_clear(struct range_tree *rt, u32 start, u32 len); +int range_tree_set(struct range_tree *rt, u32 start, u32 len); +int is_range_tree_set(struct range_tree *rt, u32 start, u32 len); +s64 range_tree_find(struct range_tree *rt, u32 len); + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c5aa127ed4cc..5684e8ce132d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <linux/tracepoint.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -155,6 +156,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } +static void unpin_uptr_kaddr(void *kaddr) +{ + if (kaddr) + unpin_user_page(virt_to_page(kaddr)); +} + +static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) +{ + const struct btf_field *field; + void **uptr_addr; + int i; + + for (i = 0, field = rec->fields; i < cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + unpin_uptr_kaddr(*uptr_addr); + } +} + +static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) +{ + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); +} + +static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) +{ + const struct btf_field *field; + const struct btf_type *t; + unsigned long start, end; + struct page *page; + void **uptr_addr; + int i, err; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return 0; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + start = *(unsigned long *)uptr_addr; + if (!start) + continue; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + /* t->size was checked for zero before */ + if (check_add_overflow(start, t->size - 1, &end)) { + err = -EFAULT; + goto unpin_all; + } + + /* The uptr's struct cannot span across two pages */ + if ((start & PAGE_MASK) != (end & PAGE_MASK)) { + err = -EOPNOTSUPP; + goto unpin_all; + } + + err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); + if (err != 1) + goto unpin_all; + + if (PageHighMem(page)) { + err = -EOPNOTSUPP; + unpin_user_page(page); + goto unpin_all; + } + + *uptr_addr = page_address(page) + offset_in_page(start); + } + + return 0; + +unpin_all: + __bpf_obj_unpin_uptrs(rec, i, obj); + return err; +} + static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { @@ -199,9 +283,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, flags); - rcu_read_unlock(); + err = bpf_obj_pin_uptrs(map->record, value); + if (!err) { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + if (err) + bpf_obj_unpin_uptrs(map->record, value); + } } bpf_enable_instrumentation(); @@ -548,6 +637,7 @@ void btf_record_free(struct 
btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); if (btf_is_kernel(rec->fields[i].kptr.btf)) @@ -597,6 +687,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (btf_is_kernel(fields[i].kptr.btf)) btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { @@ -714,6 +805,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.dtor(xchgd_field); } break; + case BPF_UPTR: + /* The caller ensured that no one is using the uptr */ + unpin_uptr_kaddr(*(void **)field_ptr); + break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; @@ -1105,7 +1200,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1161,6 +1256,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, goto free_map_tab; } break; + case BPF_UPTR: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && @@ -2933,17 +3034,33 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) +/* bpf_link_init_sleepable() allows to specify whether BPF link itself has + * "sleepable" semantics, which normally would mean that BPF link's attach + * hook can dereference link or link's underlying program for some time after + * detachment due to RCU Tasks Trace-based lifetime protection scheme. + * BPF program itself can be non-sleepable, yet, because it's transitively + * reachable through BPF link, its freeing has to be delayed until after RCU + * Tasks Trace GP. 
+ */ +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; + link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; } +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) +{ + bpf_link_init_sleepable(link, type, ops, prog, false); +} + static void bpf_link_free_id(int id) { if (!id) @@ -2976,12 +3093,24 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_dealloc(struct bpf_link *link) +{ + /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ + if (link->prog) + bpf_prog_put(link->prog); + + /* free bpf_link and its containing memory */ + if (link->ops->dealloc_deferred) + link->ops->dealloc_deferred(link); + else + link->ops->dealloc(link); +} + static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); - /* free bpf_link and its containing memory */ - link->ops->dealloc_deferred(link); + bpf_link_dealloc(link); } static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) @@ -2996,26 +3125,27 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) static void bpf_link_free(struct bpf_link *link) { const struct bpf_link_ops *ops = link->ops; - bool sleepable = false; bpf_link_free_id(link->id); - if (link->prog) { - sleepable = link->prog->sleepable; - /* detach BPF program, clean up used resources */ + /* detach BPF program, clean up used resources */ + if (link->prog) ops->release(link); - bpf_prog_put(link->prog); - } if (ops->dealloc_deferred) { - /* schedule BPF link deallocation; if underlying BPF program - * is sleepable, we need to first wait for RCU tasks trace - * sync, then go through "classic" RCU grace period + /* Schedule BPF link deallocation, which will only then + * trigger putting BPF program refcount. + * If underlying BPF program is sleepable or BPF link's target + * attach hookpoint is sleepable or otherwise requires RCU GPs + * to ensure link and its underlying BPF program is not + * reachable anymore, we need to first wait for RCU tasks + * trace sync, and then go through "classic" RCU grace period */ - if (sleepable) + if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); - } else if (ops->dealloc) - ops->dealloc(link); + } else if (ops->dealloc) { + bpf_link_dealloc(link); + } } static void bpf_link_put_deferred(struct work_struct *work) @@ -3218,7 +3348,8 @@ static void bpf_tracing_link_release(struct bpf_link *link) container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, - tr_link->trampoline)); + tr_link->trampoline, + tr_link->tgt_prog)); bpf_trampoline_put(tr_link->trampoline); @@ -3358,7 +3489,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been - * attached to a target and its initial target was cleared (below) + * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. 
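/*
 * Illustrative sketch, not part of the patch: a link type whose attach hook
 * can fault opts in to link-level sleepable semantics via the new initializer,
 * while ordinary links keep using bpf_link_init().  "my_link_lops" and
 * my_faultable_link_init() are hypothetical names; only
 * bpf_link_init_sleepable() and its 'sleepable' argument come from this change.
 */
static const struct bpf_link_ops my_link_lops;

static void my_faultable_link_init(struct bpf_link *link, struct bpf_prog *prog,
				   bool hook_is_faultable)
{
	/* With 'sleepable' set, bpf_link_free() waits for an RCU Tasks Trace
	 * grace period followed by a normal RCU grace period before dropping
	 * the prog reference and deallocating the link (for links using
	 * ->dealloc_deferred), even if the prog itself is not sleepable.
	 */
	bpf_link_init_sleepable(link, BPF_LINK_TYPE_RAW_TRACEPOINT,
				&my_link_lops, prog, hook_is_faultable);
}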
@@ -3433,7 +3564,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr); + err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -3835,8 +3966,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog); + bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog, + tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; @@ -4002,10 +4134,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_SESSION) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && - attach_type != BPF_TRACE_UPROBE_MULTI) + attach_type != BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -5258,7 +5394,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5af9e130e500..98d9b4c0daff 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/fs.h> -#include <linux/fdtable.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include <linux/bpf.h> #include <linux/vmalloc.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f8302a5ca400..c4b1a98ff726 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) +void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; ksym->end = ksym->start + size; +} + +void bpf_image_ksym_add(struct bpf_ksym *ksym) +{ bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, 
PAGE_SIZE, false, ksym->name); @@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); - bpf_image_ksym_add(image, size, ksym); + bpf_image_ksym_init(image, size, ksym); + bpf_image_ksym_add(ksym); return im; out_free_image: @@ -523,7 +528,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) +{ + struct bpf_prog_aux *aux = tgt_prog->aux; + + guard(mutex)(&aux->ext_mutex); + if (aux->prog_array_member_cnt) + /* Program extensions can not extend target prog when the target + * prog has been updated to any prog_array map as tail callee. + * It's to prevent a potential infinite loop like: + * tgt prog entry -> tgt prog subprog -> freplace prog entry + * --tailcall-> tgt prog entry. + */ + return -EBUSY; + + aux->is_extended = true; + return 0; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -544,6 +569,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; tr->extension_prog = link->link.prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); @@ -570,17 +598,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); - err = __bpf_trampoline_link_prog(link, tr); + err = __bpf_trampoline_link_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; int err; @@ -591,6 +623,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; + guard(mutex)(&tgt_prog->aux->ext_mutex); + tgt_prog->aux->is_extended = false; return err; } hlist_del_init(&link->tramp_hlist); @@ -599,12 +633,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ } /* bpf_trampoline_unlink_prog() should never fail. 
*/ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); - err = __bpf_trampoline_unlink_prog(link, tr); + err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } @@ -619,7 +655,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -733,7 +769,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr); + err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); if (err) goto err; @@ -868,6 +904,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); @@ -944,6 +982,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb99bada7e2e..f7f892a52a37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem { #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 +#define BPF_PRIV_STACK_MIN_SIZE 64 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -1181,14 +1183,17 @@ static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) /* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which * case they are equivalent, or it's STACK_ZERO, in which case we preserve * more precise STACK_ZERO. - * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take - * env->allow_ptr_leaks into account and force STACK_MISC, if necessary. + * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged + * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is + * unnecessary as both are considered equivalent when loading data and pruning, + * in case of unprivileged mode it will be incorrect to allow reads of invalid + * slots. 
*/ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { if (*stype == STACK_ZERO) return; - if (env->allow_ptr_leaks && *stype == STACK_INVALID) + if (*stype == STACK_INVALID) return; *stype = STACK_MISC; } @@ -1265,6 +1270,7 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun if (!dst->refs) return -ENOMEM; + dst->active_locks = src->active_locks; dst->acquired_refs = src->acquired_refs; return 0; } @@ -1335,13 +1341,32 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) if (err) return err; id = ++env->id_gen; + state->refs[new_ofs].type = REF_TYPE_PTR; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; - state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0; return id; } +static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type, + int id, void *ptr) +{ + struct bpf_func_state *state = cur_func(env); + int new_ofs = state->acquired_refs; + int err; + + err = resize_reference_state(state, state->acquired_refs + 1); + if (err) + return err; + state->refs[new_ofs].type = type; + state->refs[new_ofs].id = id; + state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].ptr = ptr; + + state->active_locks++; + return 0; +} + /* release function corresponding to acquire_reference_state(). Idempotent. */ static int release_reference_state(struct bpf_func_state *state, int ptr_id) { @@ -1349,10 +1374,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; if (state->refs[i].id == ptr_id) { - /* Cannot release caller references in callbacks */ - if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) - return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -1364,6 +1388,45 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) return -EINVAL; } +static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr) +{ + int i, last_idx; + + last_idx = state->acquired_refs - 1; + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != type) + continue; + if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (last_idx && i != last_idx) + memcpy(&state->refs[i], &state->refs[last_idx], + sizeof(*state->refs)); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; + state->active_locks--; + return 0; + } + } + return -EINVAL; +} + +static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type, + int id, void *ptr) +{ + struct bpf_func_state *state = cur_func(env); + int i; + + for (i = 0; i < state->acquired_refs; i++) { + struct bpf_reference_state *s = &state->refs[i]; + + if (s->type == REF_TYPE_PTR || s->type != type) + continue; + + if (s->id == id && s->ptr == ptr) + return s; + } + return NULL; +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1373,13 +1436,6 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } -static void clear_jmp_history(struct bpf_verifier_state *state) -{ - kfree(state->jmp_history); - state->jmp_history = NULL; - state->jmp_history_cnt = 0; -} - static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1389,7 +1445,6 @@ 
static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } - clear_jmp_history(state); if (free_self) kfree(state); } @@ -1415,13 +1470,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; - dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(*dst_state->jmp_history), - GFP_USER); - if (!dst_state->jmp_history) - return -ENOMEM; - dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. */ @@ -1434,12 +1482,12 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; - dst_state->active_lock.ptr = src->active_lock.ptr; - dst_state->active_lock.id = src->active_lock.id; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->insn_hist_start = src->insn_hist_start; + dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; @@ -2492,9 +2540,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). + * But we do need to make sure to not clobber insn_hist, so we keep + * chaining insn_hist_start/insn_hist_end indices as for a normal + * child state. 
*/ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; + elem->st.insn_hist_start = env->cur_state->insn_hist_end; + elem->st.insn_hist_end = elem->st.insn_hist_start; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -2525,16 +2578,36 @@ static int cmp_subprogs(const void *a, const void *b) ((struct bpf_subprog_info *)b)->start; } +/* Find subprogram that contains instruction at 'off' */ +static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *vals = env->subprog_info; + int l, r, m; + + if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0) + return NULL; + + l = 0; + r = env->subprog_cnt - 1; + while (l < r) { + m = l + (r - l + 1) / 2; + if (vals[m].start <= off) + l = m; + else + r = m - 1; + } + return &vals[l]; +} + +/* Find subprogram that starts exactly at 'off' */ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_info, env->subprog_cnt, - sizeof(env->subprog_info[0]), cmp_subprogs); - if (!p) + p = find_containing_subprog(env, off); + if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; - } static int add_subprog(struct bpf_verifier_env *env, int off) @@ -3474,11 +3547,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - u32 cnt = cur->jmp_history_cnt; - struct bpf_jmp_history_entry *p; + struct bpf_insn_hist_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3498,29 +3570,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st return 0; } - cnt++; - alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - cur->jmp_history = p; + if (cur->insn_hist_end + 1 > env->insn_hist_cap) { + alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); + p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); + if (!p) + return -ENOMEM; + env->insn_hist = p; + env->insn_hist_cap = alloc_size / sizeof(*p); + } - p = &cur->jmp_history[cnt - 1]; + p = &env->insn_hist[cur->insn_hist_end]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - cur->jmp_history_cnt = cnt; + + cur->insn_hist_end++; env->cur_hist_ent = p; return 0; } -static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, - u32 hist_end, int insn_idx) +static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, + u32 hist_start, u32 hist_end, int insn_idx) { - if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) - return &st->jmp_history[hist_end - 1]; + if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) + return &env->insn_hist[hist_end - 1]; return NULL; } @@ -3537,25 +3612,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat * history entry recording a jump from last instruction of parent state and * first instruction of given state. 
*/ -static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, - u32 *history) +static int get_prev_insn_idx(const struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + int insn_idx, u32 hist_start, u32 *hist_endp) { - u32 cnt = *history; + u32 hist_end = *hist_endp; + u32 cnt = hist_end - hist_start; - if (i == st->first_insn_idx) { + if (insn_idx == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && st->jmp_history[0].idx == i) + if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) return -ENOENT; } - if (cnt && st->jmp_history[cnt - 1].idx == i) { - i = st->jmp_history[cnt - 1].prev_idx; - (*history)--; + if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { + (*hist_endp)--; + return env->insn_hist[hist_end - 1].prev_idx; } else { - i--; + return insn_idx - 1; } - return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -3727,7 +3803,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -3772,7 +3848,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) + struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -4191,7 +4267,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded jmp_history and is able to + * logic). This backtracking relies on recorded insn_hist and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. 
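/*
 * Sketch of the new history layout (illustration only, simplified from the
 * patch): entries live in a single per-env env->insn_hist array and each
 * verifier state only records its [insn_hist_start, insn_hist_end) window,
 * instead of carrying a private jmp_history copy.  count_state_hist() is a
 * hypothetical helper showing how one state's window is walked newest-first.
 */
static u32 count_state_hist(struct bpf_verifier_env *env,
			    struct bpf_verifier_state *st)
{
	u32 i, cnt = 0;

	/* newest entry sits at insn_hist_end - 1, oldest at insn_hist_start */
	for (i = st->insn_hist_end; i > st->insn_hist_start; i--) {
		struct bpf_insn_hist_entry *p = &env->insn_hist[i - 1];

		if (p->flags)	/* e.g. stack access / spill markers */
			cnt++;
	}
	return cnt;
}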
@@ -4308,8 +4384,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 history = st->jmp_history_cnt; - struct bpf_jmp_history_entry *hist; + u32 hist_start = st->insn_hist_start; + u32 hist_end = st->insn_hist_end; + struct bpf_insn_hist_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4348,7 +4425,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_jmp_hist_entry(st, history, i); + hist = get_insn_hist_entry(env, hist_start, hist_end, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { @@ -4365,7 +4442,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(st, i, &history); + i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -4627,6 +4704,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, */ if (!env->allow_ptr_leaks && is_spilled_reg(&state->stack[spi]) && + !is_spilled_scalar_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; @@ -4731,7 +4809,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5038,7 +5116,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5423,7 +5501,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) static bool in_rcu_cs(struct bpf_verifier_env *env) { return env->cur_state->active_rcu_lock || - env->cur_state->active_lock.ptr || + cur_func(env)->active_locks || !in_sleepable(env); } @@ -5491,6 +5569,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr return ret; } +static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, + struct btf_field *field) +{ + struct bpf_reg_state *reg; + const struct btf_type *t; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + mark_reg_known_zero(env, cur_regs(env), regno); + reg = reg_state(env, regno); + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; + reg->mem_size = t->size; + reg->id = ++env->id_gen; + + return 0; +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -5519,9 +5613,15 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } + if (class != BPF_LDX && kptr_field->type == BPF_UPTR) { + verbose(env, "store to uptr disallowed\n"); + return -EACCES; + } if (class == BPF_LDX) { - val_reg = reg_state(env, value_regno); + if (kptr_field->type == BPF_UPTR) + return mark_uptr_ld_reg(env, value_regno, kptr_field); + /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. 
*/ @@ -5579,21 +5679,26 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (src != ACCESS_DIRECT) { - verbose(env, "kptr cannot be accessed indirectly by helper\n"); + verbose(env, "%s cannot be accessed indirectly by helper\n", + btf_field_type_name(field->type)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "kptr access cannot have variable offset\n"); + verbose(env, "%s access cannot have variable offset\n", + btf_field_type_name(field->type)); return -EACCES; } if (p != off + reg->var_off.value) { - verbose(env, "kptr access misaligned expected=%u off=%llu\n", + verbose(env, "%s access misaligned expected=%u off=%llu\n", + btf_field_type_name(field->type), p, off + reg->var_off.value); return -EACCES; } if (size != bpf_size_to_bytes(BPF_DW)) { - verbose(env, "kptr access size must be BPF_DW\n"); + verbose(env, "%s access size must be BPF_DW\n", + btf_field_type_name(field->type)); return -EACCES; } break; @@ -5988,6 +6093,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) +{ + if (!bpf_jit_supports_private_stack()) + return NO_PRIV_STACK; + + /* bpf_prog_check_recur() checks all prog types that use bpf trampoline + * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked + * explicitly. + */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return PRIV_STACK_ADAPTIVE; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) + return PRIV_STACK_ADAPTIVE; + fallthrough; + default: + break; + } + + return NO_PRIV_STACK; +} + static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) { if (env->prog->jit_requested) @@ -6005,17 +6138,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, + bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int depth = 0, frame = 0, i, subprog_end; + int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; i = subprog[idx].start; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -6042,11 +6178,31 @@ process_func: depth); return -EACCES; } - depth += round_up_stack_depth(env, subprog[idx].stack_depth); - if (depth > MAX_BPF_STACK) { - verbose(env, "combined stack size of %d calls is %d. Too large\n", - frame + 1, depth); - return -EACCES; + + subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); + if (priv_stack_supported) { + /* Request private stack support only if the subprog stack + * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to + * avoid jit penalty if the stack usage is small. 
+ */ + if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN && + subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) + subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE; + } + + if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > MAX_BPF_STACK) { + verbose(env, "stack size of subprog %d is %d. Too large\n", + idx, subprog_depth); + return -EACCES; + } + } else { + depth += subprog_depth; + if (depth > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + frame + 1, depth); + return -EACCES; + } } continue_func: subprog_end = subprog[idx + 1].start; @@ -6103,6 +6259,8 @@ continue_func: } i = next_insn; idx = sidx; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -6136,7 +6294,8 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up_stack_depth(env, subprog[idx].stack_depth); + if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) + depth -= round_up_stack_depth(env, subprog[idx].stack_depth); frame--; i = ret_insn[frame]; idx = ret_prog[frame]; @@ -6145,17 +6304,45 @@ continue_func: static int check_max_stack_depth(struct bpf_verifier_env *env) { + enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; struct bpf_subprog_info *si = env->subprog_info; + bool priv_stack_supported; int ret; for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].has_tail_call) { + priv_stack_mode = NO_PRIV_STACK; + break; + } + } + + if (priv_stack_mode == PRIV_STACK_UNKNOWN) + priv_stack_mode = bpf_enable_priv_stack(env->prog); + + /* All async_cb subprogs use normal kernel stack. If a particular + * subprog appears in both main prog and async_cb subtree, that + * subprog will use normal kernel stack to avoid potential nesting. + * The reverse subprog traversal ensures when main prog subtree is + * checked, the subprogs appearing in async_cb subtrees are already + * marked as using normal kernel stack, so stack size checking can + * be done properly. + */ + for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { - ret = check_max_stack_depth_subprog(env, i); + priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; + ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); if (ret < 0) return ret; } - continue; } + + for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + env->prog->aux->jits_use_priv_stack = true; + break; + } + } + return 0; } @@ -6949,7 +7136,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, - off + reg->var_off.value, BPF_KPTR); + off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { @@ -7648,19 +7835,20 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. - * cur_state->active_lock remembers which map value element or allocated + * cur_func(env)->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. 
  */
 static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 			     bool is_lock)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
-	struct bpf_verifier_state *cur = env->cur_state;
 	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_func_state *cur = cur_func(env);
 	u64 val = reg->var_off.value;
 	struct bpf_map *map = NULL;
 	struct btf *btf = NULL;
 	struct btf_record *rec;
+	int err;
 
 	if (!is_const) {
 		verbose(env,
@@ -7692,16 +7880,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 		return -EINVAL;
 	}
 	if (is_lock) {
-		if (cur->active_lock.ptr) {
+		void *ptr;
+
+		if (map)
+			ptr = map;
+		else
+			ptr = btf;
+
+		if (cur->active_locks) {
 			verbose(env,
 				"Locking two bpf_spin_locks are not allowed\n");
 			return -EINVAL;
 		}
-		if (map)
-			cur->active_lock.ptr = map;
-		else
-			cur->active_lock.ptr = btf;
-		cur->active_lock.id = reg->id;
+		err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr);
+		if (err < 0) {
+			verbose(env, "Failed to acquire lock state\n");
+			return err;
+		}
 	} else {
 		void *ptr;
 
@@ -7710,20 +7905,17 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 		else
 			ptr = btf;
 
-		if (!cur->active_lock.ptr) {
+		if (!cur->active_locks) {
 			verbose(env, "bpf_spin_unlock without taking a lock\n");
 			return -EINVAL;
 		}
-		if (cur->active_lock.ptr != ptr ||
-		    cur->active_lock.id != reg->id) {
+
+		if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) {
 			verbose(env, "bpf_spin_unlock of different lock\n");
 			return -EINVAL;
 		}
 
 		invalidate_non_owning_refs(env);
-
-		cur->active_lock.ptr = NULL;
-		cur->active_lock.id = 0;
 	}
 	return 0;
 }
@@ -7864,7 +8056,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 	if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
 		verbose(env,
 			"arg#%d expected pointer to stack or const struct bpf_dynptr\n",
-			regno);
+			regno - 1);
 		return -EINVAL;
 	}
 
@@ -7918,7 +8110,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 		if (!is_dynptr_reg_valid_init(env, reg)) {
 			verbose(env,
 				"Expected an initialized dynptr as arg #%d\n",
-				regno);
+				regno - 1);
 			return -EINVAL;
 		}
 
@@ -7926,7 +8118,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 		if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
 			verbose(env,
 				"Expected a dynptr of type %s as arg #%d\n",
-				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
+				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
 			return -EINVAL;
 		}
 
@@ -7982,6 +8174,11 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	const struct btf_type *t;
 	int spi, err, i, nr_slots, btf_id;
 
+	if (reg->type != PTR_TO_STACK) {
+		verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+		return -EINVAL;
+	}
+
 	/* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
 	 * ensures struct convention, so we wouldn't need to do any BTF
 	 * validation here. But given iter state can be passed as a parameter
@@ -7990,7 +8187,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	 */
 	btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
 	if (btf_id < 0) {
-		verbose(env, "expected valid iter pointer as arg #%d\n", regno);
+		verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
 		return -EINVAL;
 	}
 	t = btf_type_by_id(meta->btf, btf_id);
@@ -8000,7 +8197,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
 		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
 			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
-				iter_type_str(meta->btf, btf_id), regno);
+				iter_type_str(meta->btf, btf_id), regno - 1);
 			return -EINVAL;
 		}
 
@@ -8024,7 +8221,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 			break;
 		case -EINVAL:
 			verbose(env, "expected an initialized iter_%s as arg #%d\n",
-				iter_type_str(meta->btf, btf_id), regno);
+				iter_type_str(meta->btf, btf_id), regno - 1);
 			return err;
 		case -EPROTO:
 			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
@@ -9781,7 +9978,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		const char *sub_name = subprog_name(env, subprog);
 
 		/* Only global subprogs cannot be called with a lock held. */
-		if (env->cur_state->active_lock.ptr) {
+		if (cur_func(env)->active_locks) {
 			verbose(env, "global function calls are not allowed while holding a lock,\n"
 				     "use static function instead\n");
 			return -EINVAL;
@@ -9802,6 +9999,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 		verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
 			subprog, sub_name);
+		if (env->subprog_info[subprog].changes_pkt_data)
+			clear_all_pkt_pointers(env);
 		/* mark global subprog for verifying after main prog */
 		subprog_aux(env, subprog)->called = true;
 		clear_caller_saved_regs(env, caller->regs);
@@ -9910,7 +10109,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
 {
 	/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
 	 *	    u64 flags);
-	 * callback_fn(u32 index, void *callback_ctx);
+	 * callback_fn(u64 index, void *callback_ctx);
 	 */
 	callee->regs[BPF_REG_1].type = SCALAR_VALUE;
 	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
@@ -10122,17 +10321,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		caller->regs[BPF_REG_0] = *r0;
 	}
 
-	/* callback_fn frame should have released its own additions to parent's
-	 * reference state at this point, or check_reference_leak would
-	 * complain, hence it must be the same as the caller. There is no need
-	 * to copy it back.
-	 */
-	if (!callee->in_callback_fn) {
-		/* Transfer references to the caller */
-		err = copy_reference_state(caller, callee);
-		if (err)
-			return err;
-	}
+	/* Transfer references to the caller */
+	err = copy_reference_state(caller, callee);
+	if (err)
+		return err;
 
 	/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
 	 * there function call logic would reschedule callback visit. If iteration
@@ -10302,11 +10494,11 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 	bool refs_lingering = false;
 	int i;
 
-	if (!exception_exit && state->frameno && !state->in_callback_fn)
+	if (!exception_exit && state->frameno)
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
-		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+		if (state->refs[i].type != REF_TYPE_PTR)
 			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
@@ -10315,6 +10507,34 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 	return refs_lingering ? -EINVAL : 0;
 }
 
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+	int err;
+
+	if (check_lock && cur_func(env)->active_locks) {
+		verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+		return -EINVAL;
+	}
+
+	err = check_reference_leak(env, exception_exit);
+	if (err) {
+		verbose(env, "%s would lead to reference leak\n", prefix);
+		return err;
+	}
+
+	if (check_lock && env->cur_state->active_rcu_lock) {
+		verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+		return -EINVAL;
+	}
+
+	if (check_lock && env->cur_state->active_preempt_lock) {
+		verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
 				   struct bpf_reg_state *regs)
 {
@@ -10467,7 +10687,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	}
 
 	/* With LD_ABS/IND some JITs save/restore skb from r1. */
-	changes_data = bpf_helper_changes_pkt_data(fn->func);
+	changes_data = bpf_helper_changes_pkt_data(func_id);
 	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
 		verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
 			func_id_name(func_id), func_id);
@@ -10583,11 +10803,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
-		err = check_reference_leak(env, false);
-		if (err) {
-			verbose(env, "tail_call would lead to reference leak\n");
+		err = check_resource_leak(env, false, true, "tail_call");
+		if (err)
 			return err;
-		}
 		break;
 	case BPF_FUNC_get_local_storage:
 		/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -11252,6 +11470,7 @@ enum special_kfunc_type {
 	KF_bpf_preempt_enable,
 	KF_bpf_iter_css_task_new,
 	KF_bpf_session_cookie,
+	KF_bpf_get_kmem_cache,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -11317,6 +11536,7 @@ BTF_ID(func, bpf_session_cookie)
 #else
 BTF_ID_UNUSED
 #endif
+BTF_ID(func, bpf_get_kmem_cache)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11512,10 +11732,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
-	struct bpf_verifier_state *state = env->cur_state;
 	struct btf_record *rec = reg_btf_record(reg);
 
-	if (!state->active_lock.ptr) {
+	if (!cur_func(env)->active_locks) {
 		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
 		return -EFAULT;
 	}
@@ -11612,6 +11831,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
  */
 static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
+	struct bpf_reference_state *s;
 	void *ptr;
 	u32 id;
 
@@ -11628,10 +11848,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
 	}
 
 	id = reg->id;
-	if (!env->cur_state->active_lock.ptr)
+	if (!cur_func(env)->active_locks)
 		return -EINVAL;
-	if (env->cur_state->active_lock.ptr != ptr ||
-	    env->cur_state->active_lock.id != id) {
+	s = find_lock_state(env, REF_TYPE_LOCK, id, ptr);
+	if (!s) {
 		verbose(env, "held lock and object are not in the same allocation\n");
 		return -EINVAL;
 	}
@@ -12827,6 +13047,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
 		regs[BPF_REG_0].btf_id = ptr_type_id;
 
+		if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+			regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+
 		if (is_iter_next_kfunc(&meta)) {
 			struct bpf_reg_state *cur_iter;
 
@@ -15480,7 +15703,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
 		collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
 	if (linked_regs.cnt > 1) {
-		err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+		err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
 		if (err)
 			return err;
 	}
@@ -15744,26 +15967,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * gen_ld_abs() may terminate the program at runtime, leading to
 	 * reference leak.
 	 */
-	err = check_reference_leak(env, false);
-	if (err) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+	err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
+	if (err)
 		return err;
-	}
-
-	if (env->cur_state->active_lock.ptr) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
-		return -EINVAL;
-	}
-
-	if (env->cur_state->active_rcu_lock) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
-		return -EINVAL;
-	}
-
-	if (env->cur_state->active_preempt_lock) {
-		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
-		return -EINVAL;
-	}
 
 	if (regs[ctx_reg].type != PTR_TO_CTX) {
 		verbose(env,
@@ -15909,6 +16115,16 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 			return -ENOTSUPP;
 		}
 		break;
+	case BPF_PROG_TYPE_KPROBE:
+		switch (env->prog->expected_attach_type) {
+		case BPF_TRACE_KPROBE_SESSION:
+		case BPF_TRACE_UPROBE_SESSION:
+			range = retval_range(0, 1);
+			break;
+		default:
+			return 0;
+		}
+		break;
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		range = retval_range(SK_DROP, SK_PASS);
 		break;
@@ -15968,6 +16184,29 @@ enforce_retval:
 	return 0;
 }
 
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+	struct bpf_subprog_info *subprog;
+
+	subprog = find_containing_subprog(env, off);
+	subprog->changes_pkt_data = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+	struct bpf_subprog_info *caller, *callee;
+
+	caller = find_containing_subprog(env, t);
+	callee = find_containing_subprog(env, w);
+	caller->changes_pkt_data |= callee->changes_pkt_data;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -16101,6 +16340,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 				bool visit_callee)
 {
 	int ret, insn_sz;
+	int w;
 
 	insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
 	ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
@@ -16112,8 +16352,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 		mark_jmp_point(env, t + insn_sz);
 
 	if (visit_callee) {
+		w = t + insns[t].imm + 1;
 		mark_prune_point(env, t);
-		ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+		merge_callee_effects(env, t, w);
+		ret = push_insn(t, w, BRANCH, env);
 	}
 	return ret;
 }
@@ -16175,10 +16417,7 @@ static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
 /* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */
 static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta)
 {
-	if (meta->btf == btf_vmlinux)
-		return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
-		       meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast];
-	return false;
+	return meta->kfunc_flags & KF_FASTCALL;
 }
 
 /* LLVM define a bpf_fastcall function attribute.
@@ -16433,6 +16672,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			mark_prune_point(env, t);
 			mark_jmp_point(env, t);
 		}
+		if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm))
+			mark_subprog_changes_pkt_data(env, t);
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 			struct bpf_kfunc_call_arg_meta meta;
 
@@ -16567,6 +16808,7 @@ walk_cfg:
 		}
 	}
 	ret = 0; /* cfg looks good */
+	env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
 
 err_free:
 	kvfree(insn_state);
@@ -17513,8 +17755,20 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
 		return false;
 
 	for (i = 0; i < old->acquired_refs; i++) {
-		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+		    old->refs[i].type != cur->refs[i].type)
+			return false;
+		switch (old->refs[i].type) {
+		case REF_TYPE_PTR:
+			break;
+		case REF_TYPE_LOCK:
+			if (old->refs[i].ptr != cur->refs[i].ptr)
+				return false;
+			break;
+		default:
+			WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
 			return false;
+		}
 	}
 
 	return true;
@@ -17592,19 +17846,6 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->speculative && !cur->speculative)
 		return false;
 
-	if (old->active_lock.ptr != cur->active_lock.ptr)
-		return false;
-
-	/* Old and cur active_lock's have to be either both present
-	 * or both absent.
-	 */
-	if (!!old->active_lock.id != !!cur->active_lock.id)
-		return false;
-
-	if (old->active_lock.id &&
-	    !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
-		return false;
-
 	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
@@ -17880,7 +18121,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 
 	force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
 			  /* Avoid accumulating infinitely long jmp history */
-			  cur->jmp_history_cnt > 40;
+			  cur->insn_hist_end - cur->insn_hist_start > 40;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -18078,7 +18319,7 @@ hit:
 			 * the current state.
 			 */
 			if (is_jmp_point(env, env->insn_idx))
-				err = err ? : push_jmp_history(env, cur, 0, 0);
+				err = err ? : push_insn_history(env, cur, 0, 0);
 			err = err ? : propagate_precision(env, &sl->state);
 			if (err)
 				return err;
@@ -18177,8 +18418,8 @@ next:
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
+	cur->insn_hist_start = cur->insn_hist_end;
 	cur->dfs_depth = new->dfs_depth + 1;
-	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -18346,7 +18587,7 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (is_jmp_point(env, env->insn_idx)) {
-			err = push_jmp_history(env, state, 0, 0);
+			err = push_insn_history(env, state, 0, 0);
 			if (err)
 				return err;
 		}
@@ -18506,7 +18747,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				if (env->cur_state->active_lock.ptr) {
+				if (cur_func(env)->active_locks) {
 					if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
 					    (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
 					     (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
@@ -18555,30 +18796,14 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 process_bpf_exit_full:
-				if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
-					verbose(env, "bpf_spin_unlock is missing\n");
-					return -EINVAL;
-				}
-
-				if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
-					verbose(env, "bpf_rcu_read_unlock is missing\n");
-					return -EINVAL;
-				}
-
-				if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
-					verbose(env, "%d bpf_preempt_enable%s missing\n",
-						env->cur_state->active_preempt_lock,
-						env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
-					return -EINVAL;
-				}
-
 				/* We must do check_reference_leak here before
 				 * prepare_func_exit to handle the case when
 				 * state->curframe > 0, it may be a callback
 				 * function, for which reference_state must
 				 * match caller reference state when it exits.
 				 */
-				err = check_reference_leak(env, exception_exit);
+				err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
+							  "BPF_EXIT instruction");
 				if (err)
 					return err;
 
@@ -20041,6 +20266,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+		if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+			func[i]->aux->jits_use_priv_stack = true;
+
 		func[i]->jit_requested = 1;
 		func[i]->blinding_requested = prog->blinding_requested;
 		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
@@ -20069,6 +20297,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->num_exentries = num_exentries;
 		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
 		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+		func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
 		if (!i)
 			func[i]->aux->exception_boundary = env->seen_exception;
 		func[i] = bpf_int_jit_compile(func[i]);
@@ -21809,6 +22038,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 		}
 	}
 
+	if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+		verbose(env, "Private stack not supported by jit\n");
+		return -EACCES;
+	}
+
 	/* btf_ctx_access() used this to provide argument type info */
 	prog->aux->ctx_arg_info = st_ops_desc->arg_info[member_idx].info;
 
@@ -21894,6 +22128,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	}
 	if (tgt_prog) {
 		struct bpf_prog_aux *aux = tgt_prog->aux;
+		bool tgt_changes_pkt_data;
 
 		if (bpf_prog_is_dev_bound(prog->aux) &&
 		    !bpf_prog_dev_bound_match(prog, tgt_prog)) {
@@ -21928,6 +22163,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 					"Extension programs should be JITed\n");
 				return -EINVAL;
 			}
+			tgt_changes_pkt_data = aux->func
+					       ? aux->func[subprog]->aux->changes_pkt_data
+					       : aux->changes_pkt_data;
+			if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+				bpf_log(log,
+					"Extension program changes packet data, while original does not\n");
+				return -EINVAL;
+			}
 		}
 		if (!tgt_prog->jited) {
 			bpf_log(log, "Can attach to only JITed progs\n");
@@ -22393,10 +22636,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret < 0)
 		goto skip_full_check;
 
-	ret = check_attach_btf_id(env);
-	if (ret)
-		goto skip_full_check;
-
 	ret = resolve_pseudo_ldimm64(env);
 	if (ret < 0)
 		goto skip_full_check;
@@ -22411,6 +22650,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret < 0)
 		goto skip_full_check;
 
+	ret = check_attach_btf_id(env);
+	if (ret)
+		goto skip_full_check;
+
 	ret = mark_fastcall_patterns(env);
 	if (ret < 0)
 		goto skip_full_check;
@@ -22547,6 +22790,7 @@ err_unlock:
 	if (!is_priv)
 		mutex_unlock(&bpf_verifier_lock);
 	vfree(env->insn_aux_data);
+	kvfree(env->insn_hist);
err_free_env:
 	kvfree(env);
 	return ret;