author     2025-01-08 18:06:37 -0800
committer  2025-01-08 18:06:45 -0800
commit     e8ec1c94866a44723c8ab1c90942503f3402ede8
tree       f24d2737d0d2fcc7b7d45e00980dc70292ed5189
parent     selftests/bpf: Add kprobe session recursion check test
parent     bpf: Remove migrate_{disable|enable} from bpf_selem_free()
Merge branch 'bpf-reduce-the-use-of-migrate_-disable-enable'
Hou Tao says:
====================
The use of the migrate_{disable|enable} pair in BPF is mainly due to the
introduction of the bpf memory allocator and the use of per-CPU data
structures in its internal implementation. The caller needs to disable
migration before invoking the alloc or free APIs of the bpf memory
allocator, and to enable migration again after the invocation.
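As a rough illustration (not code from this series; the wrapper name is
made up), the calling convention around the allocator looks like this
minimal sketch:

  #include <linux/bpf_mem_alloc.h>
  #include <linux/preempt.h>

  /* The bpf memory allocator serves objects from per-CPU caches, so the
   * task must not migrate to another CPU while the allocator runs.  A
   * caller that is not already running with migration disabled has to
   * bracket the call itself.
   */
  static void *example_alloc(struct bpf_mem_alloc *ma)
  {
          void *obj;

          migrate_disable();
          obj = bpf_mem_cache_alloc(ma);
          migrate_enable();

          return obj;
  }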
The main users of the bpf memory allocator are the various kinds of bpf
maps whose map values, or the special fields within those map values,
are allocated through the bpf memory allocator.
At present, the running context of a bpf program has already disabled
migration, either explicitly or implicitly, so when these maps are
manipulated from a bpf program there is no need to invoke the
migrate_disable() and migrate_enable() pair. However, that is not always
the case when these maps are manipulated through the bpf syscall, so
many migrate_{disable|enable} pairs were added wherever a map can be
manipulated by either a BPF program or the BPF syscall.
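The shape used for the syscall side in this series can be seen in the
kernel/bpf/syscall.c hunk below; reduced to a sketch (simplified, not
the full function):

  /* Disable migration once around the implementation-specific
   * ->map_free() callback so the callbacks may free objects that came
   * from the bpf memory allocator without adding their own pairs.
   */
  static void example_map_free(struct bpf_map *map)
  {
          migrate_disable();
          map->ops->map_free(map);
          migrate_enable();
  }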
The initial idea of reducing the use of migrate_{disable|enable} comes
from Alexei [1]. I turned it into a patch set that achieves the goal
through the following three methods:
1. Remove the unnecessary migrate_{disable|enable} pair.
When the BPF syscall path also disables migration, it is OK to remove
the pair. Patches #1~#3 fall into this category, while patches #4~#5
partially do as well.
2. Move the migrate_{disable|enable} pair from the inner callee to the
outer caller.
Instead of invoking migrate_disable() in the inner callee, invoke it in
the outer caller, which simplifies reasoning about when
migrate_disable() is needed. Patches #4~#5 and #6~#19 belong to this
category (see the sketch after this list).
3. Add a cant_migrate() check in the inner callee.
The cant_migrate() check in the inner callee ensures that the guarantee
of disabled migration is not silently broken. Patches #1~#5, #13, and
#16~#19 also belong to this category.
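As a rough combined illustration of methods 2 and 3, mirroring the shape
of the bpf_cgrp_storage/bpf_task_storage changes below (all names here
are illustrative, not the exact kernel code):

  #include <linux/kernel.h>
  #include <linux/percpu.h>
  #include <linux/preempt.h>
  #include <linux/rcupdate.h>

  static DEFINE_PER_CPU(int, example_storage_busy);

  /* Method 3: the inner callee no longer disables migration itself; it
   * only asserts that the caller already did.
   */
  static void example_storage_lock(void)
  {
          cant_migrate();
          this_cpu_inc(example_storage_busy);
  }

  static void example_storage_unlock(void)
  {
          this_cpu_dec(example_storage_busy);
  }

  /* Method 2: the outer caller now owns the migrate_{disable|enable}
   * pair for the whole free path.
   */
  void example_storage_free(void)
  {
          migrate_disable();
          rcu_read_lock();

          example_storage_lock();
          /* ... destroy the per-object local storage ... */
          example_storage_unlock();

          rcu_read_unlock();
          migrate_enable();
  }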
Please check the individual patches for more details. Comments are
always welcome.
Change Log:
v2:
* squash the ->map_free related patches (#10~#12, #15) into one patch
* remove unnecessary cant_migrate() checks.
v1: https://lore.kernel.org/bpf/20250106081900.1665573-1-houtao@huaweicloud.com
====================
Link: https://patch.msgid.link/20250108010728.207536-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
 kernel/bpf/arraymap.c          |  6
 kernel/bpf/bpf_cgrp_storage.c  | 15
 kernel/bpf/bpf_inode_storage.c |  9
 kernel/bpf/bpf_local_storage.c | 30
 kernel/bpf/bpf_task_storage.c  | 15
 kernel/bpf/cpumask.c           |  2
 kernel/bpf/hashtab.c           | 19
 kernel/bpf/helpers.c           |  4
 kernel/bpf/lpm_trie.c          | 20
 kernel/bpf/range_tree.c        |  2
 kernel/bpf/syscall.c           | 10
 net/core/bpf_sk_storage.c      | 11
 12 files changed, 55 insertions(+), 88 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6cdbb4c33d31..eb28c0f219ee 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -735,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
 	u64 ret = 0;
 	void *val;
 
+	cant_migrate();
+
 	if (flags != 0)
 		return -EINVAL;
 
 	is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	array = container_of(map, struct bpf_array, map);
-	if (is_percpu)
-		migrate_disable();
 	for (i = 0; i < map->max_entries; i++) {
 		if (is_percpu)
 			val = this_cpu_ptr(array->pptrs[i]);
@@ -756,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
 			break;
 	}
 
-	if (is_percpu)
-		migrate_enable();
 	return num_elems;
 }
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 20f05de92e9c..d5dc65bb1755 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
 
 static void bpf_cgrp_storage_lock(void)
 {
-	migrate_disable();
+	cant_migrate();
 	this_cpu_inc(bpf_cgrp_storage_busy);
 }
 
 static void bpf_cgrp_storage_unlock(void)
 {
 	this_cpu_dec(bpf_cgrp_storage_busy);
-	migrate_enable();
 }
 
 static bool bpf_cgrp_storage_trylock(void)
 {
-	migrate_disable();
+	cant_migrate();
 	if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
 		this_cpu_dec(bpf_cgrp_storage_busy);
-		migrate_enable();
 		return false;
 	}
 	return true;
@@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
 	struct bpf_local_storage *local_storage;
 
+	migrate_disable();
 	rcu_read_lock();
 	local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
-	if (!local_storage) {
-		rcu_read_unlock();
-		return;
-	}
+	if (!local_storage)
+		goto out;
 
 	bpf_cgrp_storage_lock();
 	bpf_local_storage_destroy(local_storage);
 	bpf_cgrp_storage_unlock();
+out:
 	rcu_read_unlock();
+	migrate_enable();
 }
 
 static struct bpf_local_storage_data *
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index a51c82dee1bd..15a3eb9b02d9 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -62,16 +62,17 @@ void bpf_inode_storage_free(struct inode *inode)
 	if (!bsb)
 		return;
 
+	migrate_disable();
 	rcu_read_lock();
 
 	local_storage = rcu_dereference(bsb->storage);
-	if (!local_storage) {
-		rcu_read_unlock();
-		return;
-	}
+	if (!local_storage)
+		goto out;
 
 	bpf_local_storage_destroy(local_storage);
+out:
 	rcu_read_unlock();
+	migrate_enable();
 }
 
 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index e94820f6b0cd..fa56c30833ff 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 		return NULL;
 
 	if (smap->bpf_ma) {
-		migrate_disable();
 		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
-		migrate_enable();
 		if (selem)
 			/* Keep the original bpf_map_kzalloc behavior
 			 * before started using the bpf_mem_cache_alloc.
@@ -174,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 		return;
 	}
 
-	if (smap) {
-		migrate_disable();
+	if (smap)
 		bpf_mem_cache_free(&smap->storage_ma, local_storage);
-		migrate_enable();
-	} else {
+	else
 		/* smap could be NULL if the selem that triggered
 		 * this 'local_storage' creation had been long gone.
 		 * In this case, directly do call_rcu().
 		 */
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
-	}
 }
 
 /* rcu tasks trace callback for bpf_ma == false */
@@ -217,7 +212,10 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
 	/* The bpf_local_storage_map_free will wait for rcu_barrier */
 	smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+	migrate_disable();
 	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+	migrate_enable();
 	bpf_mem_cache_raw_free(selem);
 }
 
@@ -256,9 +254,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		 * bpf_mem_cache_free will be able to reuse selem
 		 * immediately.
 		 */
-		migrate_disable();
 		bpf_mem_cache_free(&smap->selem_ma, selem);
-		migrate_enable();
 		return;
 	}
 
@@ -497,15 +493,11 @@ int bpf_local_storage_alloc(void *owner,
 	if (err)
 		return err;
 
-	if (smap->bpf_ma) {
-		migrate_disable();
+	if (smap->bpf_ma)
 		storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
-		migrate_enable();
-	} else {
+	else
 		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
 					  gfp_flags | __GFP_NOWARN);
-	}
-
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -902,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 		while ((selem = hlist_entry_safe(
 				rcu_dereference_raw(hlist_first_rcu(&b->list)),
 				struct bpf_local_storage_elem, map_node))) {
-			if (busy_counter) {
-				migrate_disable();
+			if (busy_counter)
 				this_cpu_inc(*busy_counter);
-			}
 			bpf_selem_unlink(selem, true);
-			if (busy_counter) {
+			if (busy_counter)
 				this_cpu_dec(*busy_counter);
-				migrate_enable();
-			}
 			cond_resched_rcu();
 		}
 		rcu_read_unlock();
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index bf7fa15fdcc6..1109475953c0 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -24,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
 
 static void bpf_task_storage_lock(void)
 {
-	migrate_disable();
+	cant_migrate();
 	this_cpu_inc(bpf_task_storage_busy);
 }
 
 static void bpf_task_storage_unlock(void)
 {
 	this_cpu_dec(bpf_task_storage_busy);
-	migrate_enable();
 }
 
 static bool bpf_task_storage_trylock(void)
 {
-	migrate_disable();
+	cant_migrate();
 	if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
 		this_cpu_dec(bpf_task_storage_busy);
-		migrate_enable();
 		return false;
 	}
 	return true;
@@ -72,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task)
 {
 	struct bpf_local_storage *local_storage;
 
+	migrate_disable();
 	rcu_read_lock();
 
 	local_storage = rcu_dereference(task->bpf_storage);
-	if (!local_storage) {
-		rcu_read_unlock();
-		return;
-	}
+	if (!local_storage)
+		goto out;
 
 	bpf_task_storage_lock();
 	bpf_local_storage_destroy(local_storage);
 	bpf_task_storage_unlock();
+out:
 	rcu_read_unlock();
+	migrate_enable();
 }
 
 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 33c473d676a5..cfa1c18e3a48 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -91,9 +91,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
 	if (!refcount_dec_and_test(&cpumask->usage))
 		return;
 
-	migrate_disable();
 	bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
-	migrate_enable();
 }
 
 __bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ec941a0ea41..40095dda891d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -897,11 +897,9 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
 	check_and_free_fields(htab, l);
 
-	migrate_disable();
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
 	bpf_mem_cache_free(&htab->ma, l);
-	migrate_enable();
 }
 
 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -1502,10 +1500,9 @@ static void delete_all_elements(struct bpf_htab *htab)
 {
 	int i;
 
-	/* It's called from a worker thread, so disable migration here,
-	 * since bpf_mem_cache_free() relies on that.
+	/* It's called from a worker thread and migration has been disabled,
+	 * therefore, it is OK to invoke bpf_mem_cache_free() directly.
 	 */
-	migrate_disable();
 	for (i = 0; i < htab->n_buckets; i++) {
 		struct hlist_nulls_head *head = select_bucket(htab, i);
 		struct hlist_nulls_node *n;
@@ -1517,7 +1514,6 @@ static void delete_all_elements(struct bpf_htab *htab)
 		}
 		cond_resched();
 	}
-	migrate_enable();
 }
 
 static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
@@ -2208,17 +2204,18 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
 	bool is_percpu;
 	u64 ret = 0;
 
+	cant_migrate();
+
 	if (flags != 0)
 		return -EINVAL;
 
 	is_percpu = htab_is_percpu(htab);
 
 	roundup_key_size = round_up(map->key_size, 8);
-	/* disable migration so percpu value prepared here will be the
-	 * same as the one seen by the bpf program with bpf_map_lookup_elem().
+	/* migration has been disabled, so percpu value prepared here will be
+	 * the same as the one seen by the bpf program with
+	 * bpf_map_lookup_elem().
 	 */
-	if (is_percpu)
-		migrate_disable();
 	for (i = 0; i < htab->n_buckets; i++) {
 		b = &htab->buckets[i];
 		rcu_read_lock();
@@ -2244,8 +2241,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
 		rcu_read_unlock();
 	}
 out:
-	if (is_percpu)
-		migrate_enable();
 	return num_elems;
 }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cd5f9884d85b..bcda671feafd 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2066,9 +2066,7 @@ unlock:
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		migrate_disable();
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-		migrate_enable();
 	}
 }
 
@@ -2105,9 +2103,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 		obj -= field->graph_root.node_offset;
 
-		migrate_disable();
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-		migrate_enable();
 	}
 }
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index f8bc1e096182..e8a772e64324 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -289,16 +289,11 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 }
 
 static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
-						 const void *value,
-						 bool disable_migration)
+						 const void *value)
 {
 	struct lpm_trie_node *node;
 
-	if (disable_migration)
-		migrate_disable();
 	node = bpf_mem_cache_alloc(&trie->ma);
-	if (disable_migration)
-		migrate_enable();
 	if (!node)
 		return NULL;
 
@@ -342,10 +337,8 @@ static long trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	/* Allocate and fill a new node. Need to disable migration before
-	 * invoking bpf_mem_cache_alloc().
-	 */
-	new_node = lpm_trie_node_alloc(trie, value, true);
+	/* Allocate and fill a new node */
+	new_node = lpm_trie_node_alloc(trie, value);
 	if (!new_node)
 		return -ENOMEM;
 
@@ -425,8 +418,7 @@ static long trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	/* migration is disabled within the locked scope */
-	im_node = lpm_trie_node_alloc(trie, NULL, false);
+	im_node = lpm_trie_node_alloc(trie, NULL);
 	if (!im_node) {
 		trie->n_entries--;
 		ret = -ENOMEM;
@@ -452,11 +444,9 @@ static long trie_update_elem(struct bpf_map *map,
 out:
 	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-	migrate_disable();
 	if (ret)
 		bpf_mem_cache_free(&trie->ma, new_node);
 	bpf_mem_cache_free_rcu(&trie->ma, free_node);
-	migrate_enable();
 
 	return ret;
 }
@@ -555,10 +545,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 out:
 	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-	migrate_disable();
 	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
 	bpf_mem_cache_free_rcu(&trie->ma, free_node);
-	migrate_enable();
 
 	return ret;
 }
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 5bdf9aadca3a..37b80a23ae1a 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -259,9 +259,7 @@ void range_tree_destroy(struct range_tree *rt)
 
 	while ((rn = range_it_iter_first(rt, 0, -1U))) {
 		range_it_remove(rn, rt);
-		migrate_disable();
 		bpf_mem_free(&bpf_global_ma, rn);
-		migrate_enable();
 	}
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e88797fdbeb..0daf098e3207 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -796,11 +796,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			if (!btf_is_kernel(field->kptr.btf)) {
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
-				migrate_disable();
 				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
 								 pointee_struct_meta->record : NULL,
 								 fields[i].type == BPF_KPTR_PERCPU);
-				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
@@ -835,8 +833,14 @@ static void bpf_map_free(struct bpf_map *map)
 	struct btf_record *rec = map->record;
 	struct btf *btf = map->btf;
 
-	/* implementation dependent freeing */
+	/* implementation dependent freeing. Disabling migration to simplify
+	 * the free of values or special fields allocated from bpf memory
+	 * allocator.
+	 */
+	migrate_disable();
 	map->ops->map_free(map);
+	migrate_enable();
+
 	/* Delay freeing of btf_record for maps, as map_free
 	 * callback usually needs access to them. It is better to do it here
 	 * than require each callback to do the free itself manually.
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 2f4ed83a75ae..7d41cde1bcca 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk)
 {
 	struct bpf_local_storage *sk_storage;
 
+	migrate_disable();
 	rcu_read_lock();
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
-	if (!sk_storage) {
-		rcu_read_unlock();
-		return;
-	}
+	if (!sk_storage)
+		goto out;
 	bpf_local_storage_destroy(sk_storage);
+out:
 	rcu_read_unlock();
+	migrate_enable();
 }
 
 static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 
 	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
 
+	migrate_disable();
 	rcu_read_lock();
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 
@@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 
 out:
 	rcu_read_unlock();
+	migrate_enable();
 
 	/* In case of an error, don't free anything explicitly here, the
 	 * caller is responsible to call bpf_sk_storage_free.