author     2025-01-20 09:09:02 -0800
committer  2025-01-20 09:10:16 -0800
commit     d10cafc5d54a0f70681ab2f739ea6c46282c86f9 (patch)
tree       a97c594da58593c690600d8153249f00390783d7 /kernel
parent     tools: Sync if_xdp.h uapi tooling header (diff)
parent     selftests/bpf: Add test case for the freeing of bpf_timer (diff)
Merge branch 'free-htab-element-out-of-bucket-lock'
Hou Tao says:
====================
The patch set continues the previous work [1] to move all freeing of
htab elements out of the bucket lock. One motivation for the patch set is
the locking problem reported by Sebastian [2]: the freeing of bpf_timer
under PREEMPT_RT may acquire a spin-lock (namely softirq_expiry_lock).
However, the freeing procedure for an htab element already holds a
raw spin-lock (namely the bucket lock), so it triggers the warning
"BUG: scheduling while atomic", as demonstrated by the selftests patch.
Another motivation is to reduce the locked scope of the bucket lock.
However, the patch set doesn't move all freeing of htab elements out of
the bucket lock; it still keeps the freeing of special fields in
pre-allocated hash maps under the protection of the bucket lock in
htab_map_update_elem(). The patch set is structured as follows:
* Patch #1 moves the element freeing out of the bucket lock for
htab_lru_map_delete_node(). However, the freeing is still within the
locked scope of the LRU raw spin lock.
* Patches #2-#3 move the element freeing out of the bucket lock for
__htab_map_lookup_and_delete_elem() (see the simplified sketch after
this list).
* Patch #4 cancels the bpf_timer in two steps to fix the locking
problem in htab_map_update_elem() for PREEMPT_RT.
* Patch #5 adds a selftest for the locking problem.
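The first three patches all follow the same pattern, sketched below in
simplified form (illustrative only, not the exact patch code; names
such as b->raw_lock are assumed): the element is unlinked while the
bucket lock is held, but its special fields (e.g. bpf_timer) are freed
only after the raw lock has been dropped.

	raw_spin_lock_irqsave(&b->raw_lock, flags);
	hlist_nulls_del_rcu(&l->hash_node);	/* only unlink under the bucket lock */
	raw_spin_unlock_irqrestore(&b->raw_lock, flags);

	/* timer cancellation and other freeing now run outside the raw lock */
	check_and_free_fields(htab, l);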
Please see individual patches for more details. Comments are always
welcome.
---
v3:
* patch #1: update the commit message to state that the freeing of
special fields is still within the locked scope of the LRU raw spin lock
* patch #4: cancel the bpf_timer in two steps only for PREEMPT_RT
(suggested by Alexei)
v2: https://lore.kernel.org/bpf/20250109061901.2620825-1-houtao@huaweicloud.com
* cancel the bpf_timer in two steps instead of breaking the refill of
per-cpu ->extra_elems into two steps
v1: https://lore.kernel.org/bpf/20250107085559.3081563-1-houtao@huaweicloud.com
[1]: https://lore.kernel.org/bpf/20241106063542.357743-1-houtao@huaweicloud.com
[2]: https://lore.kernel.org/bpf/20241106084527.4gPrMnHt@linutronix.de
====================
Link: https://patch.msgid.link/20250117101816.2101857-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/bpf/hashtab.c   60
-rw-r--r--   kernel/bpf/helpers.c   18
2 files changed, 48 insertions, 30 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 40095dda891d..4a9eeb7aef85 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -824,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			check_and_free_fields(htab, l);
 			bpf_map_dec_elem_count(&htab->map);
 			break;
 		}
 
 	htab_unlock_bucket(htab, b, tgt_l->hash, flags);
 
+	if (l == tgt_l)
+		check_and_free_fields(htab, l);
 	return l == tgt_l;
 }
 
@@ -1634,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (!l) {
 		ret = -ENOENT;
-	} else {
-		if (is_percpu) {
-			u32 roundup_value_size = round_up(map->value_size, 8);
-			void __percpu *pptr;
-			int off = 0, cpu;
+		goto out_unlock;
+	}
 
-			pptr = htab_elem_get_ptr(l, key_size);
-			for_each_possible_cpu(cpu) {
-				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
-				check_and_init_map_value(&htab->map, value + off);
-				off += roundup_value_size;
-			}
-		} else {
-			u32 roundup_key_size = round_up(map->key_size, 8);
+	if (is_percpu) {
+		u32 roundup_value_size = round_up(map->value_size, 8);
+		void __percpu *pptr;
+		int off = 0, cpu;
 
-			if (flags & BPF_F_LOCK)
-				copy_map_value_locked(map, value, l->key +
-						      roundup_key_size,
-						      true);
-			else
-				copy_map_value(map, value, l->key +
-					       roundup_key_size);
-			/* Zeroing special fields in the temp buffer */
-			check_and_init_map_value(map, value);
+		pptr = htab_elem_get_ptr(l, key_size);
+		for_each_possible_cpu(cpu) {
+			copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+			check_and_init_map_value(&htab->map, value + off);
+			off += roundup_value_size;
 		}
+	} else {
+		u32 roundup_key_size = round_up(map->key_size, 8);
 
-		hlist_nulls_del_rcu(&l->hash_node);
-		if (!is_lru_map)
-			free_htab_elem(htab, l);
+		if (flags & BPF_F_LOCK)
+			copy_map_value_locked(map, value, l->key +
+					      roundup_key_size,
+					      true);
+		else
+			copy_map_value(map, value, l->key +
+				       roundup_key_size);
+		/* Zeroing special fields in the temp buffer */
+		check_and_init_map_value(map, value);
 	}
 
+	hlist_nulls_del_rcu(&l->hash_node);
+out_unlock:
 	htab_unlock_bucket(htab, b, hash, bflags);
 
-	if (is_lru_map && l)
-		htab_lru_push_free(htab, l);
+	if (l) {
+		if (is_lru_map)
+			htab_lru_push_free(htab, l);
+		else
+			free_htab_elem(htab, l);
+	}
 	return ret;
 }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcda671feafd..f27ce162427a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
 	 * To avoid these issues, punt to workqueue context when we are in a
 	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running))
+	if (this_cpu_read(hrtimer_running)) {
 		queue_work(system_unbound_wq, &t->cb.delete_work);
-	else
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		/* If the timer is running on other CPU, also use a kworker to
+		 * wait for the completion of the timer instead of trying to
+		 * acquire a sleepable lock in hrtimer_cancel() to wait for its
+		 * completion.
+		 */
+		if (hrtimer_try_to_cancel(&t->timer) >= 0)
+			kfree_rcu(t, cb.rcu);
+		else
+			queue_work(system_unbound_wq, &t->cb.delete_work);
+	} else {
 		bpf_timer_delete_work(&t->cb.delete_work);
+	}
 }
 
 /* This function is called by map_delete/update_elem for individual element and