From 55eff109e76a14e5ed10c8c3c3978d20a35e2a4d Mon Sep 17 00:00:00 2001 From: Junnan Wu Date: Fri, 14 Feb 2025 09:22:00 +0800 Subject: vsock/virtio: fix variables initialization during resuming When executing suspend to ram twice in a row, the `rx_buf_nr` and `rx_buf_max_nr` increase to three times vq->num_free. Then after virtqueue_get_buf and `rx_buf_nr` decreased in function virtio_transport_rx_work, the condition to fill rx buffer (rx_buf_nr < rx_buf_max_nr / 2) will never be met. It is because that `rx_buf_nr` and `rx_buf_max_nr` are initialized only in virtio_vsock_probe(), but they should be reset whenever virtqueues are recreated, like after a suspend/resume. Move the `rx_buf_nr` and `rx_buf_max_nr` initialization in virtio_vsock_vqs_init(), so we are sure that they are properly initialized, every time we initialize the virtqueues, either when we load the driver or after a suspend/resume. To prevent erroneous atomic load operations on the `queued_replies` in the virtio_transport_send_pkt_work() function which may disrupt the scheduling of vsock->rx_work when transmitting reply-required socket packets, this atomic variable must undergo synchronized initialization alongside the preceding two variables after a suspend/resume. Fixes: bd50c5dc182b ("vsock/virtio: add support for device suspend/resume") Link: https://lore.kernel.org/virtualization/20250207052033.2222629-1-junnan01.wu@samsung.com/ Co-developed-by: Ying Gao Signed-off-by: Ying Gao Signed-off-by: Junnan Wu Reviewed-by: Luigi Leonardi Acked-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20250214012200.1883896-1-junnan01.wu@samsung.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b58c3818f284..f0e48e6911fc 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -670,6 +670,13 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) }; int ret; + mutex_lock(&vsock->rx_lock); + vsock->rx_buf_nr = 0; + vsock->rx_buf_max_nr = 0; + mutex_unlock(&vsock->rx_lock); + + atomic_set(&vsock->queued_replies, 0); + ret = virtio_find_vqs(vdev, VSOCK_VQ_MAX, vsock->vqs, vqs_info, NULL); if (ret < 0) return ret; @@ -779,9 +786,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - vsock->rx_buf_nr = 0; - vsock->rx_buf_max_nr = 0; - atomic_set(&vsock->queued_replies, 0); mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); -- cgit v1.2.3-59-g8ed1b From 071ed42cff4fcdd89025d966d48eabef59913bf2 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Thu, 13 Feb 2025 23:36:10 +0100 Subject: net/sched: cls_api: fix error handling causing NULL dereference tcf_exts_miss_cookie_base_alloc() calls xa_alloc_cyclic() which can return 1 if the allocation succeeded after wrapping. This was treated as an error, with value 1 returned to caller tcf_exts_init_ex() which sets exts->actions to NULL and returns 1 to caller fl_change(). fl_change() treats err == 1 as success, calling tcf_exts_validate_ex() which calls tcf_action_init() with exts->actions as argument, where it is dereferenced. Example trace: BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 114 PID: 16151 Comm: handler114 Kdump: loaded Not tainted 5.14.0-503.16.1.el9_5.x86_64 #1 RIP: 0010:tcf_action_init+0x1f8/0x2c0 Call Trace: tcf_action_init+0x1f8/0x2c0 tcf_exts_validate_ex+0x175/0x190 fl_change+0x537/0x1120 [cls_flower] Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Signed-off-by: Pierre Riteau Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250213223610.320278-1-pierre@stackhpc.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8e47e5355be6..4f648af8cfaa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -97,7 +97,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); - if (err) + if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; -- cgit v1.2.3-59-g8ed1b From 07b598c0e6f06a0f254c88dafb4ad50f8a8c6eea Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 13 Feb 2025 15:20:55 +0000 Subject: drop_monitor: fix incorrect initialization order Syzkaller reports the following bug: BUG: spinlock bad magic on CPU#1, syz-executor.0/7995 lock: 0xffff88805303f3e0, .magic: 00000000, .owner: /-1, .owner_cpu: 0 CPU: 1 PID: 7995 Comm: syz-executor.0 Tainted: G E 5.10.209+ #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x119/0x179 lib/dump_stack.c:118 debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline] do_raw_spin_lock+0x1f6/0x270 kernel/locking/spinlock_debug.c:112 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:117 [inline] _raw_spin_lock_irqsave+0x50/0x70 kernel/locking/spinlock.c:159 reset_per_cpu_data+0xe6/0x240 [drop_monitor] net_dm_cmd_trace+0x43d/0x17a0 [drop_monitor] genl_family_rcv_msg_doit+0x22f/0x330 net/netlink/genetlink.c:739 genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] genl_rcv_msg+0x341/0x5a0 net/netlink/genetlink.c:800 netlink_rcv_skb+0x14d/0x440 net/netlink/af_netlink.c:2497 genl_rcv+0x29/0x40 net/netlink/genetlink.c:811 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x54b/0x800 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x914/0xe00 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:651 [inline] __sock_sendmsg+0x157/0x190 net/socket.c:663 ____sys_sendmsg+0x712/0x870 net/socket.c:2378 ___sys_sendmsg+0xf8/0x170 net/socket.c:2432 __sys_sendmsg+0xea/0x1b0 net/socket.c:2461 do_syscall_64+0x30/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x62/0xc7 RIP: 0033:0x7f3f9815aee9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f3f972bf0c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f3f9826d050 RCX: 00007f3f9815aee9 RDX: 0000000020000000 RSI: 0000000020001300 RDI: 0000000000000007 RBP: 00007f3f981b63bd R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007f3f9826d050 R15: 00007ffe01ee6768 If drop_monitor is built as a kernel module, syzkaller may have time to send a netlink NET_DM_CMD_START message during the module loading. This will call the net_dm_monitor_start() function that uses a spinlock that has not yet been initialized. To fix this, let's place resource initialization above the registration of a generic netlink family. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 9a8afc8d3962 ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250213152054.2785669-1-Ilia.Gavrilov@infotecs.ru Signed-off-by: Jakub Kicinski --- net/core/drop_monitor.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 6efd4cccc9dd..212f0a048cab 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1766,19 +1766,18 @@ static void exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); -- cgit v1.2.3-59-g8ed1b From 8fb5bb169d17cdd12c2dcc2e96830ed487d77a0f Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:49 +0100 Subject: sockmap, vsock: For connectible sockets allow only connected sockmap expects all vsocks to have a transport assigned, which is expressed in vsock_proto::psock_update_sk_prot(). However, there is an edge case where an unconnected (connectible) socket may lose its previously assigned transport. This is handled with a NULL check in the vsock/BPF recv path. Another design detail is that listening vsocks are not supposed to have any transport assigned at all. Which implies they are not supported by the sockmap. But this is complicated by the fact that a socket, before switching to TCP_LISTEN, may have had some transport assigned during a failed connect() attempt. Hence, we may end up with a listening vsock in a sockmap, which blows up quickly: KASAN: null-ptr-deref in range [0x0000000000000120-0x0000000000000127] CPU: 7 UID: 0 PID: 56 Comm: kworker/7:0 Not tainted 6.14.0-rc1+ Workqueue: vsock-loopback vsock_loopback_work RIP: 0010:vsock_read_skb+0x4b/0x90 Call Trace: sk_psock_verdict_data_ready+0xa4/0x2e0 virtio_transport_recv_pkt+0x1ca8/0x2acc vsock_loopback_work+0x27d/0x3f0 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x35a/0x700 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 For connectible sockets, instead of relying solely on the state of vsk->transport, tell sockmap to only allow those representing established connections. This aligns with the behaviour for AF_INET and AF_UNIX. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/core/sock_map.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792..2f1be9baad05 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -541,6 +541,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } -- cgit v1.2.3-59-g8ed1b From 857ae05549ee2542317e7084ecaa5f8536634dd9 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:50 +0100 Subject: vsock/bpf: Warn on socket without transport In the spirit of commit 91751e248256 ("vsock: prevent null-ptr-deref in vsock_*[has_data|has_space]"), armorize the "impossible" cases with a warning. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 3 +++ net/vmw_vsock/vsock_bpf.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 53a081d49d28..7e3db87ae433 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1189,6 +1189,9 @@ static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); + if (WARN_ON_ONCE(!vsk->transport)) + return -ENODEV; + return vsk->transport->read_skb(vsk, read_actor); } diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index f201d9eca1df..07b96d56f3a5 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -87,7 +87,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, lock_sock(sk); vsk = vsock_sk(sk); - if (!vsk->transport) { + if (WARN_ON_ONCE(!vsk->transport)) { copied = -ENODEV; goto out; } -- cgit v1.2.3-59-g8ed1b From f5da7c45188eea71394bf445655cae2df88a7788 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Feb 2025 15:29:05 -0800 Subject: tcp: adjust rcvq_space after updating scaling ratio Since commit under Fixes we set the window clamp in accordance to newly measured rcvbuf scaling_ratio. If the scaling_ratio decreased significantly we may put ourselves in a situation where windows become smaller than rcvq_space, preventing tcp_rcv_space_adjust() from increasing rcvbuf. The significant decrease of scaling_ratio is far more likely since commit 697a6c8cec03 ("tcp: increase the default TCP scaling ratio"), which increased the "default" scaling ratio from ~30% to 50%. Hitting the bad condition depends a lot on TCP tuning, and drivers at play. One of Meta's workloads hits it reliably under following conditions: - default rcvbuf of 125k - sender MTU 1500, receiver MTU 5000 - driver settles on scaling_ratio of 78 for the config above. Initial rcvq_space gets calculated as TCP_INIT_CWND * tp->advmss (10 * 5k = 50k). Once we find out the true scaling ratio and MSS we clamp the windows to 38k. Triggering the condition also depends on the message sequence of this workload. I can't repro the problem with simple iperf or TCP_RR-style tests. Fixes: a2cbb1603943 ("tcp: Update window clamping condition") Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250217232905.3162187-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb82e01da911..98b8cc740392 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -243,9 +243,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; - if (old_ratio != tcp_sk(sk)->scaling_ratio) - WRITE_ONCE(tcp_sk(sk)->window_clamp, - tcp_win_from_space(sk, sk->sk_rcvbuf)); + if (old_ratio != tcp_sk(sk)->scaling_ratio) { + struct tcp_sock *tp = tcp_sk(sk); + + val = tcp_win_from_space(sk, sk->sk_rcvbuf); + tcp_set_window_clamp(sk, val); + + if (tp->window_clamp < tp->rcvq_space.space) + tp->rcvq_space.space = tp->window_clamp; + } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); -- cgit v1.2.3-59-g8ed1b From e57a6320215c3967f51ab0edeff87db2095440e4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:27 -0800 Subject: net: Add net_passive_inc() and net_passive_dec(). net_drop_ns() is NULL when CONFIG_NET_NS is disabled. The next patch introduces a function that increments and decrements net->passive. As a prep, let's rename and export net_free() to net_passive_dec() and add net_passive_inc(). Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89i+oUCt2VGvrbrweniTendZFEh+nwS=uonc004-aPkWy-Q@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 11 +++++++++++ net/core/net_namespace.c | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 7ba1402ca779..f467a66abc6b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -297,6 +297,7 @@ static inline int check_net(const struct net *net) } void net_drop_ns(void *); +void net_passive_dec(struct net *net); #else @@ -326,8 +327,18 @@ static inline int check_net(const struct net *net) } #define net_drop_ns NULL + +static inline void net_passive_dec(struct net *net) +{ + refcount_dec(&net->passive); +} #endif +static inline void net_passive_inc(struct net *net) +{ + refcount_inc(&net->passive); +} + /* Returns true if the netns initialization is completed successfully */ static inline bool net_initialized(const struct net *net) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb39a12b2f82..4303f2a49262 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -464,7 +464,7 @@ static void net_complete_free(void) } -static void net_free(struct net *net) +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -482,7 +482,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -523,7 +523,7 @@ put_userns: key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -672,7 +672,7 @@ static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } cleanup_net_task = NULL; } -- cgit v1.2.3-59-g8ed1b From 65161fb544aada499c912b6010a8f7d8e04f6130 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:28 -0800 Subject: net: Fix dev_net(dev) race in unregister_netdevice_notifier_dev_net(). After the cited commit, dev_net(dev) is fetched before holding RTNL and passed to __unregister_netdevice_notifier_net(). However, dev_net(dev) might be different after holding RTNL. In the reported case [0], while removing a VF device, its netns was being dismantled and the VF was moved to init_net. So the following sequence is basically illegal when dev was fetched without lookup: net = dev_net(dev); rtnl_net_lock(net); Let's use a new helper rtnl_net_dev_lock() to fix the race. It fetches dev_net_rcu(dev), bumps its net->passive, and checks if dev_net_rcu(dev) is changed after rtnl_net_lock(). [0]: BUG: KASAN: slab-use-after-free in notifier_call_chain (kernel/notifier.c:75 (discriminator 2)) Read of size 8 at addr ffff88810cefb4c8 by task test-bridge-lag/21127 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) print_report (mm/kasan/report.c:379 mm/kasan/report.c:489) kasan_report (mm/kasan/report.c:604) notifier_call_chain (kernel/notifier.c:75 (discriminator 2)) call_netdevice_notifiers_info (net/core/dev.c:2011) unregister_netdevice_many_notify (net/core/dev.c:11551) unregister_netdevice_queue (net/core/dev.c:11487) unregister_netdev (net/core/dev.c:11635) mlx5e_remove (drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6552 drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6579) mlx5_core auxiliary_bus_remove (drivers/base/auxiliary.c:230) device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296) bus_remove_device (./include/linux/kobject.h:193 drivers/base/base.h:73 drivers/base/bus.c:583) device_del (drivers/base/power/power.h:142 drivers/base/core.c:3855) mlx5_rescan_drivers_locked (./include/linux/auxiliary_bus.h:241 drivers/net/ethernet/mellanox/mlx5/core/dev.c:333 drivers/net/ethernet/mellanox/mlx5/core/dev.c:535 drivers/net/ethernet/mellanox/mlx5/core/dev.c:549) mlx5_core mlx5_unregister_device (drivers/net/ethernet/mellanox/mlx5/core/dev.c:468) mlx5_core mlx5_uninit_one (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 drivers/net/ethernet/mellanox/mlx5/core/main.c:1563) mlx5_core remove_one (drivers/net/ethernet/mellanox/mlx5/core/main.c:965 drivers/net/ethernet/mellanox/mlx5/core/main.c:2019) mlx5_core pci_device_remove (./include/linux/pm_runtime.h:129 drivers/pci/pci-driver.c:475) device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296) unbind_store (drivers/base/bus.c:245) kernfs_fop_write_iter (fs/kernfs/file.c:338) vfs_write (fs/read_write.c:587 (discriminator 1) fs/read_write.c:679 (discriminator 1)) ksys_write (fs/read_write.c:732) do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f6a4d5018b7 Fixes: 7fb1073300a2 ("net: Hold rtnl_net_lock() in (un)?register_netdevice_notifier_dev_net().") Reported-by: Yael Chemla Closes: https://lore.kernel.org/netdev/146eabfe-123c-4970-901e-e961b4c09bc3@nvidia.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b91658e8aedb..19e268568282 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2070,6 +2070,42 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -2077,6 +2113,11 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct net *net = dev_net(dev); int err; + /* rtnl_net_lock() assumes dev is not yet published by + * register_netdevice(). + */ + DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list)); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { @@ -2093,13 +2134,12 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); list_del(&nn->list); - err = __unregister_netdevice_notifier_net(net, nb); - rtnl_net_unlock(net); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); return err; } -- cgit v1.2.3-59-g8ed1b From d4c6bfc83936cb61fac99e9891c406fbdd40f964 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:29 -0800 Subject: dev: Use rtnl_net_dev_lock() in unregister_netdev(). The following sequence is basically illegal when dev was fetched without lookup because dev_net(dev) might be different after holding rtnl_net_lock(): net = dev_net(dev); rtnl_net_lock(net); Let's use rtnl_net_dev_lock() in unregister_netdev(). Note that there is no real bug in unregister_netdev() for now because RTNL protects the scope even if dev_net(dev) is changed before/after RTNL. Fixes: 00fb9823939e ("dev: Hold per-netns RTNL in (un)?register_netdev().") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 19e268568282..fafd2f4b5d5d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11920,11 +11920,9 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - struct net *net = dev_net(dev); - - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -- cgit v1.2.3-59-g8ed1b From 3e5796862c692ea608d96f0a1437f9290f44953a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:07 -0800 Subject: flow_dissector: Fix handling of mixed port and port-range keys This patch fixes a bug in TC flower filter where rules combining a specific destination port with a source port range weren't working correctly. The specific case was when users tried to configure rules like: tc filter add dev ens38 ingress protocol ip flower ip_proto udp \ dst_port 5000 src_port 2000-3000 action drop The root cause was in the flow dissector code. While both FLOW_DISSECTOR_KEY_PORTS and FLOW_DISSECTOR_KEY_PORTS_RANGE flags were being set correctly in the classifier, the __skb_flow_dissect_ports() function was only populating one of them: whichever came first in the enum check. This meant that when the code needed both a specific port and a port range, one of them would be left as 0, causing the filter to not match packets as expected. Fix it by removing the either/or logic and instead checking and populating both key types independently when they're in use. Fixes: 8ffb055beae5 ("cls_flower: Fix the behavior using port ranges with hw-offload") Reported-by: Qiang Zhang Closes: https://lore.kernel.org/netdev/CAPx+-5uvFxkhkz4=j_Xuwkezjn9U6kzKTD5jz4tZ9msSJ0fOJA@mail.gmail.com/ Cc: Yoshiki Komachi Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250218043210.732959-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5db41bf2ed93..c33af3ef0b79 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void -- cgit v1.2.3-59-g8ed1b From 69ab34f705fbfabcace64b5d53bb7a4450fac875 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:09 -0800 Subject: flow_dissector: Fix port range key handling in BPF conversion Fix how port range keys are handled in __skb_flow_bpf_to_target() by: - Separating PORTS and PORTS_RANGE key handling - Using correct key_ports_range structure for range keys - Properly initializing both key types independently This ensures port range information is correctly stored in its dedicated structure rather than incorrectly using the regular ports key structure. Fixes: 59fb9b62fb6c ("flow_dissector: Fix to use new variables for port ranges in bpf hook") Reported-by: Qiang Zhang Closes: https://lore.kernel.org/netdev/CAPx+-5uvFxkhkz4=j_Xuwkezjn9U6kzKTD5jz4tZ9msSJ0fOJA@mail.gmail.com/ Cc: Yoshiki Komachi Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250218043210.732959-4-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c33af3ef0b79..9cd8de6bebb5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -931,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -975,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { -- cgit v1.2.3-59-g8ed1b From 606572eb22c1786a3957d24307f5760bb058ca19 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Tue, 18 Feb 2025 16:12:16 +0800 Subject: sctp: Fix undefined behavior in left shift operation According to the C11 standard (ISO/IEC 9899:2011, 6.5.7): "If E1 has a signed type and E1 x 2^E2 is not representable in the result type, the behavior is undefined." Shifting 1 << 31 causes signed integer overflow, which leads to undefined behavior. Fix this by explicitly using '1U << 31' to ensure the shift operates on an unsigned type, avoiding undefined behavior. Signed-off-by: Yu-Chun Lin Link: https://patch.msgid.link/20250218081217.3468369-1-eleanor15x@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index c241cc552e8d..bfcff6d6a438 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -735,7 +735,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ - init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); + init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); -- cgit v1.2.3-59-g8ed1b From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:30 -0800 Subject: net: Add non-RCU dev_getbyhwaddr() helper Add dedicated helper for finding devices by hardware address when holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not. Extract common address comparison logic into dev_addr_cmp(). The context about this change could be found in the following discussion: Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/ Cc: kuniyu@amazon.com Cc: ushankar@purestorage.com Suggested-by: Eric Dumazet Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..94b7d4eca003 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); diff --git a/net/core/dev.c b/net/core/dev.c index fafd2f4b5d5d..72459dd02f38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1121,6 +1121,12 @@ out: return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1129,7 +1135,7 @@ out: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1141,14 +1147,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; -- cgit v1.2.3-59-g8ed1b From 4eae0ee0f1e6256d0b0b9dd6e72f1d9cf8f72e08 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:31 -0800 Subject: arp: switch to dev_getbyhwaddr() in arp_req_set_public() The arp_req_set_public() function is called with the rtnl lock held, which provides enough synchronization protection. This makes the RCU variant of dev_getbyhwaddr() unnecessary. Switch to using the simpler dev_getbyhwaddr() function since we already have the required rtnl locking. This change helps maintain consistency in the networking code by using the appropriate helper function for the existing locking context. Since we're not holding the RCU read lock in arp_req_set_public() existing code could trigger false positive locking warnings. Fixes: 941666c2e3e0 ("net: RCU conversion of dev_getbyhwaddr() and arp_ioctl()") Suggested-by: Kuniyuki Iwashima Reviewed-by: Kuniyuki Iwashima Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-2-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f23a1ec6694c..814300eee39d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1077,7 +1077,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; -- cgit v1.2.3-59-g8ed1b From 9b6412e6979f6f9e0632075f8f008937b5cd4efd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 17 Feb 2025 11:23:35 +0100 Subject: tcp: drop secpath at the same time as we currently drop dst Xiumei reported hitting the WARN in xfrm6_tunnel_net_exit while running tests that boil down to: - create a pair of netns - run a basic TCP test over ipcomp6 - delete the pair of netns The xfrm_state found on spi_byaddr was not deleted at the time we delete the netns, because we still have a reference on it. This lingering reference comes from a secpath (which holds a ref on the xfrm_state), which is still attached to an skb. This skb is not leaked, it ends up on sk_receive_queue and then gets defer-free'd by skb_attempt_defer_free. The problem happens when we defer freeing an skb (push it on one CPU's defer_list), and don't flush that list before the netns is deleted. In that case, we still have a reference on the xfrm_state that we don't expect at this point. We already drop the skb's dst in the TCP receive path when it's no longer needed, so let's also drop the secpath. At this point, tcp_filter has already called into the LSM hooks that may require the secpath, so it should not be needed anymore. However, in some of those places, the MPTCP extension has just been attached to the skb, so we cannot simply drop all extensions. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/5055ba8f8f72bdcb602faa299faca73c280b7735.1739743613.git.sd@queasysnail.net Signed-off-by: Paolo Abeni --- include/net/tcp.h | 14 ++++++++++++++ net/ipv4/tcp_fastopen.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 2 +- 4 files changed, 21 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688..930cda5b5eb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -683,6 +684,19 @@ void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} + /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0f523cbfe329..32b28fc21b63 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -178,7 +178,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) if (!skb) return; - skb_dst_drop(skb); + tcp_cleanup_skb(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects @@ -195,7 +195,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98b8cc740392..0cbf81bf3d45 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4976,7 +4976,7 @@ static void tcp_ofo_queue(struct sock *sk) tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); @@ -5168,7 +5168,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; @@ -5251,7 +5251,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); return; } - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -6232,7 +6232,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cc2b5194a18d..2632844d2c35 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2027,7 +2027,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); -- cgit v1.2.3-59-g8ed1b From 14ad6ed30a10afbe91b0749d6378285f4225d482 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:39 +0100 Subject: net: allow small head cache usage with large MAX_SKB_FRAGS values Sabrina reported the following splat: WARNING: CPU: 0 PID: 1 at net/core/dev.c:6935 netif_napi_add_weight_locked+0x8f2/0xba0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0-rc1-net-00092-g011b03359038 #996 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:netif_napi_add_weight_locked+0x8f2/0xba0 Code: e8 c3 e6 6a fe 48 83 c4 28 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc c7 44 24 10 ff ff ff ff e9 8f fb ff ff e8 9e e6 6a fe <0f> 0b e9 d3 fe ff ff e8 92 e6 6a fe 48 8b 04 24 be ff ff ff ff 48 RSP: 0000:ffffc9000001fc60 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88806ce48128 RCX: 1ffff11001664b9e RDX: ffff888008f00040 RSI: ffffffff8317ca42 RDI: ffff88800b325cb6 RBP: ffff88800b325c40 R08: 0000000000000001 R09: ffffed100167502c R10: ffff88800b3a8163 R11: 0000000000000000 R12: ffff88800ac1c168 R13: ffff88800ac1c168 R14: ffff88800ac1c168 R15: 0000000000000007 FS: 0000000000000000(0000) GS:ffff88806ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888008201000 CR3: 0000000004c94001 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: gro_cells_init+0x1ba/0x270 xfrm_input_init+0x4b/0x2a0 xfrm_init+0x38/0x50 ip_rt_init+0x2d7/0x350 ip_init+0xf/0x20 inet_init+0x406/0x590 do_one_initcall+0x9d/0x2e0 do_initcalls+0x23b/0x280 kernel_init_freeable+0x445/0x490 kernel_init+0x20/0x1d0 ret_from_fork+0x46/0x80 ret_from_fork_asm+0x1a/0x30 irq event stamp: 584330 hardirqs last enabled at (584338): [] __up_console_sem+0x77/0xb0 hardirqs last disabled at (584345): [] __up_console_sem+0x5c/0xb0 softirqs last enabled at (583242): [] netlink_insert+0x14d/0x470 softirqs last disabled at (583754): [] netif_napi_add_weight_locked+0x77d/0xba0 on kernel built with MAX_SKB_FRAGS=45, where SKB_WITH_OVERHEAD(1024) is smaller than GRO_MAX_HEAD. Such built additionally contains the revert of the single page frag cache so that napi_get_frags() ends up using the page frag allocator, triggering the splat. Note that the underlying issue is independent from the mentioned revert; address it ensuring that the small head cache will fit either TCP and GRO allocation and updating napi_alloc_skb() and __netdev_alloc_skb() to select kmalloc() usage for any allocation fitting such cache. Reported-by: Sabrina Dubroca Suggested-by: Eric Dumazet Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 +++ net/core/gro.c | 3 --- net/core/skbuff.c | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/gro.h b/include/net/gro.h index b9b58c1f8d19..7b548f91754b 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -11,6 +11,9 @@ #include #include +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + struct napi_gro_cb { union { struct { diff --git a/net/core/gro.c b/net/core/gro.c index d1f44084e978..78b320b63174 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -7,9 +7,6 @@ #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c..f5a6d50570c4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -736,7 +739,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -816,7 +819,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) * When the small frag allocator is available, prefer it over kmalloc * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && + len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, -- cgit v1.2.3-59-g8ed1b From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:40 +0100 Subject: Revert "net: skb: introduce and use a single page frag cache" After the previous commit is finally safe to revert commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"): do it here. The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to recent updates in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 - net/core/dev.c | 17 ++++++++ net/core/skbuff.c | 104 +++------------------------------------------- 3 files changed, 22 insertions(+), 100 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94b7d4eca003..ab550a89b9bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 72459dd02f38..1b252e9459fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6991,6 +6991,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f5a6d50570c4..7b03b64fdcb2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -223,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -293,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -816,11 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && - len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -830,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) -- cgit v1.2.3-59-g8ed1b