From 8542d6fac25c03b4bf36b2d762cfe60fda8491bb Mon Sep 17 00:00:00 2001 From: Tengteng Yang Date: Tue, 27 May 2025 11:04:19 +0800 Subject: Fix sock_exceed_buf_limit not being triggered in __sk_mem_raise_allocated When a process under memory pressure is not part of any cgroup and the charged flag is false, trace_sock_exceed_buf_limit was not called as expected. This regression was introduced by commit 2def8ff3fdb6 ("sock: Code cleanup on __sk_mem_raise_allocated()"). The fix changes the default value of charged to true while preserving existing logic. Fixes: 2def8ff3fdb6 ("sock: Code cleanup on __sk_mem_raise_allocated()") Signed-off-by: Abel Wu Signed-off-by: Tengteng Yang Link: https://patch.msgid.link/20250527030419.67693-1-yangtengteng@bytedance.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 341979874459..3b409bc8ef6d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3284,16 +3284,16 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; - bool charged = false; + bool charged = true; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { - if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) + charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + if (!charged) goto suppress_allocation; - charged = true; } /* Under limit. */ @@ -3378,7 +3378,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (charged) + if (memcg && charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; -- cgit v1.2.3-59-g8ed1b From 271683bb2cf32e5126c592b5d5e6a756fa374fd9 Mon Sep 17 00:00:00 2001 From: Dong Chenchen Date: Tue, 27 May 2025 19:41:52 +0800 Subject: page_pool: Fix use-after-free in page_pool_recycle_in_ring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot reported a uaf in page_pool_recycle_in_ring: BUG: KASAN: slab-use-after-free in lock_release+0x151/0xa30 kernel/locking/lockdep.c:5862 Read of size 8 at addr ffff8880286045a0 by task syz.0.284/6943 CPU: 0 UID: 0 PID: 6943 Comm: syz.0.284 Not tainted 6.13.0-rc3-syzkaller-gdfa94ce54f41 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 lock_release+0x151/0xa30 kernel/locking/lockdep.c:5862 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:165 [inline] _raw_spin_unlock_bh+0x1b/0x40 kernel/locking/spinlock.c:210 spin_unlock_bh include/linux/spinlock.h:396 [inline] ptr_ring_produce_bh include/linux/ptr_ring.h:164 [inline] page_pool_recycle_in_ring net/core/page_pool.c:707 [inline] page_pool_put_unrefed_netmem+0x748/0xb00 net/core/page_pool.c:826 page_pool_put_netmem include/net/page_pool/helpers.h:323 [inline] page_pool_put_full_netmem include/net/page_pool/helpers.h:353 [inline] napi_pp_put_page+0x149/0x2b0 net/core/skbuff.c:1036 skb_pp_recycle net/core/skbuff.c:1047 [inline] skb_free_head net/core/skbuff.c:1094 [inline] skb_release_data+0x6c4/0x8a0 net/core/skbuff.c:1125 skb_release_all net/core/skbuff.c:1190 [inline] __kfree_skb net/core/skbuff.c:1204 [inline] sk_skb_reason_drop+0x1c9/0x380 net/core/skbuff.c:1242 kfree_skb_reason include/linux/skbuff.h:1263 [inline] __skb_queue_purge_reason include/linux/skbuff.h:3343 [inline] root cause is: page_pool_recycle_in_ring ptr_ring_produce spin_lock(&r->producer_lock); WRITE_ONCE(r->queue[r->producer++], ptr) //recycle last page to pool page_pool_release page_pool_scrub page_pool_empty_ring ptr_ring_consume page_pool_return_page //release all page __page_pool_destroy free_percpu(pool->recycle_stats); free(pool) //free spin_unlock(&r->producer_lock); //pool->ring uaf read recycle_stat_inc(pool, ring); page_pool can be free while page pool recycle the last page in ring. Add producer-lock barrier to page_pool_release to prevent the page pool from being free before all pages have been recycled. recycle_stat_inc() is empty when CONFIG_PAGE_POOL_STATS is not enabled, which will trigger Wempty-body build warning. Add definition for pool stat macro to fix warning. Suggested-by: Jakub Kicinski Link: https://lore.kernel.org/netdev/20250513083123.3514193-1-dongchenchen2@huawei.com Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") Reported-by: syzbot+204a4382fcb3311f3858@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=204a4382fcb3311f3858 Signed-off-by: Dong Chenchen Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250527114152.3119109-1-dongchenchen2@huawei.com Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4011eb305cee..ba7cf3e3c32f 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -153,9 +153,9 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else -#define alloc_stat_inc(pool, __stat) -#define recycle_stat_inc(pool, __stat) -#define recycle_stat_add(pool, __stat, val) +#define alloc_stat_inc(...) do { } while (0) +#define recycle_stat_inc(...) do { } while (0) +#define recycle_stat_add(...) do { } while (0) #endif static bool page_pool_producer_lock(struct page_pool *pool) @@ -741,19 +741,16 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { - int ret; - /* BH protection not needed if current is softirq */ - if (in_softirq()) - ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); - else - ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); + bool in_softirq, ret; - if (!ret) { + /* BH protection not needed if current is softirq */ + in_softirq = page_pool_producer_lock(pool); + ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); + if (ret) recycle_stat_inc(pool, ring); - return true; - } + page_pool_producer_unlock(pool, in_softirq); - return false; + return ret; } /* Only allow direct recycling in special circumstances, into the @@ -1150,10 +1147,14 @@ static void page_pool_scrub(struct page_pool *pool) static int page_pool_release(struct page_pool *pool) { + bool in_softirq; int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); + /* Acquire producer lock to make sure producers have exited. */ + in_softirq = page_pool_producer_lock(pool); + page_pool_producer_unlock(pool, in_softirq); if (!inflight) __page_pool_destroy(pool); -- cgit v1.2.3-59-g8ed1b From fd579a2ebbe4b7e6b388915a50d904e772a35c61 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 May 2025 16:01:43 +0100 Subject: rxrpc: Fix return from none_validate_challenge() Fix the return value of none_validate_challenge() to be explicitly true (which indicates the source packet should simply be discarded) rather than implicitly true (because rxrpc_abort_conn() always returns -EPROTO which gets converted to true). Note that this change doesn't change the behaviour of the code (which is correct by accident) and, in any case, we *shouldn't* get a CHALLENGE packet to an rxnull connection (ie. no security). Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009738.html Signed-off-by: David Howells cc: Marc Dionne cc: Jakub Kicinski cc: "David S. Miller" cc: Eric Dumazet cc: Paolo Abeni cc: Simon Horman cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/10720.1748358103@warthog.procyon.org.uk Fixes: 5800b1cf3fd8 ("rxrpc: Allow CHALLENGEs to the passed to the app for a RESPONSE") Signed-off-by: Paolo Abeni --- net/rxrpc/insecure.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 1f7c136d6d0e..0a260df45d25 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -45,8 +45,9 @@ static void none_free_call_crypto(struct rxrpc_call *call) static bool none_validate_challenge(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxrpc_eproto_rxnull_challenge); + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_challenge); + return true; } static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, -- cgit v1.2.3-59-g8ed1b From f29ccaa07cf3d35990f4d25028cc55470d29372b Mon Sep 17 00:00:00 2001 From: Charalampos Mitrodimas Date: Tue, 27 May 2025 16:35:44 +0000 Subject: net: tipc: fix refcount warning in tipc_aead_encrypt syzbot reported a refcount warning [1] caused by calling get_net() on a network namespace that is being destroyed (refcount=0). This happens when a TIPC discovery timer fires during network namespace cleanup. The recently added get_net() call in commit e279024617134 ("net/tipc: fix slab-use-after-free Read in tipc_aead_encrypt_done") attempts to hold a reference to the network namespace. However, if the namespace is already being destroyed, its refcount might be zero, leading to the use-after-free warning. Replace get_net() with maybe_get_net(), which safely checks if the refcount is non-zero before incrementing it. If the namespace is being destroyed, return -ENODEV early, after releasing the bearer reference. [1]: https://lore.kernel.org/all/68342b55.a70a0220.253bc2.0091.GAE@google.com/T/#m12019cf9ae77e1954f666914640efa36d52704a2 Reported-by: syzbot+f0c4a4aba757549ae26c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68342b55.a70a0220.253bc2.0091.GAE@google.com/T/#m12019cf9ae77e1954f666914640efa36d52704a2 Fixes: e27902461713 ("net/tipc: fix slab-use-after-free Read in tipc_aead_encrypt_done") Signed-off-by: Charalampos Mitrodimas Reviewed-by: Tung Nguyen Link: https://patch.msgid.link/20250527-net-tipc-warning-v2-1-df3dc398a047@posteo.net Signed-off-by: Paolo Abeni --- net/tipc/crypto.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index f4cfe88670f5..ea5bb131ebd0 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -818,7 +818,11 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, } /* Get net to avoid freed tipc_crypto when delete namespace */ - get_net(aead->crypto->net); + if (!maybe_get_net(aead->crypto->net)) { + tipc_bearer_put(b); + rc = -ENODEV; + goto exit; + } /* Now, do encrypt */ rc = crypto_aead_encrypt(req); -- cgit v1.2.3-59-g8ed1b From 03f1700b9b4d4f2fed3165370f3c23db76553178 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 22 May 2025 21:16:02 +0300 Subject: Bluetooth: MGMT: reject malformed HCI_CMD_SYNC commands In 'mgmt_hci_cmd_sync()', check whether the size of parameters passed in 'struct mgmt_cp_hci_cmd_sync' matches the total size of the data (i.e. 'sizeof(struct mgmt_cp_hci_cmd_sync)' plus trailing bytes). Otherwise, large invalid 'params_len' will cause 'hci_cmd_sync_alloc()' to do 'skb_put_data()' from an area beyond the one actually passed to 'mgmt_hci_cmd_sync()'. Reported-by: syzbot+5fe2d5bfbfbec0b675a0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5fe2d5bfbfbec0b675a0 Fixes: 827af4787e74 ("Bluetooth: MGMT: Add initial implementation of MGMT_OP_HCI_CMD_SYNC") Signed-off-by: Dmitry Antipov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 261926dccc7e..14a9462fced5 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2566,7 +2566,8 @@ static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev, struct mgmt_pending_cmd *cmd; int err; - if (len < sizeof(*cp)) + if (len != (offsetof(struct mgmt_cp_hci_cmd_sync, params) + + le16_to_cpu(cp->params_len))) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, MGMT_STATUS_INVALID_PARAMS); -- cgit v1.2.3-59-g8ed1b From 03dba9cea72f977e873e4e60e220fa596959dd8f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 28 May 2025 14:53:11 -0400 Subject: Bluetooth: L2CAP: Fix not responding with L2CAP_CR_LE_ENCRYPTION Depending on the security set the response to L2CAP_LE_CONN_REQ shall be just L2CAP_CR_LE_ENCRYPTION if only encryption when BT_SECURITY_MEDIUM is selected since that means security mode 2 which doesn't require authentication which is something that is covered in the qualification test L2CAP/LE/CFC/BV-25-C. Link: https://github.com/bluez/bluez/issues/1270 Fixes: 27e2d4c8d28b ("Bluetooth: Add basic LE L2CAP connect request receiving support") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 042d3ac3b4a3..a5bde5db58ef 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4870,7 +4870,8 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, if (!smp_sufficient_security(conn->hcon, pchan->sec_level, SMP_ALLOW_STK)) { - result = L2CAP_CR_LE_AUTHENTICATION; + result = pchan->sec_level == BT_SECURITY_MEDIUM ? + L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION; chan = NULL; goto response_unlock; } -- cgit v1.2.3-59-g8ed1b From efdddc4484859082da6c7877ed144c8121c8ea55 Mon Sep 17 00:00:00 2001 From: Álvaro Fernández Rojas Date: Thu, 29 May 2025 14:44:06 +0200 Subject: net: dsa: tag_brcm: legacy: fix pskb_may_pull length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BRCM_LEG_PORT_ID was incorrectly used for pskb_may_pull length. The correct check is BRCM_LEG_TAG_LEN + VLAN_HLEN, or 10 bytes. Fixes: 964dbf186eaa ("net: dsa: tag_brcm: add support for legacy tags") Signed-off-by: Álvaro Fernández Rojas Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250529124406.2513779-1-noltari@gmail.com Signed-off-by: Jakub Kicinski --- net/dsa/tag_brcm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8c3c068728e5..fe75821623a4 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -257,7 +257,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, int source_port; u8 *brcm_tag; - if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) + if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) return NULL; brcm_tag = dsa_etype_header_pos_rx(skb); -- cgit v1.2.3-59-g8ed1b From c1f4cb8a8d48db5c8396e1976c5b2e7922d944b9 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 28 May 2025 21:10:58 +0000 Subject: net: Fix net_devmem_bind_dmabuf for non-devmem configs Fix the signature of the net_devmem_bind_dmabuf API for CONFIG_NET_DEVMEM=n. Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Pranjal Shrivastava Link: https://patch.msgid.link/20250528211058.1826608-1-praan@google.com Signed-off-by: Jakub Kicinski --- net/core/devmem.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/devmem.h b/net/core/devmem.h index e7ba77050b8f..0a3b28ba5c13 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -170,8 +170,9 @@ static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) } static inline struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { -- cgit v1.2.3-59-g8ed1b From 3ec523304976648b45a3eef045e97d17122ff1b2 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Thu, 29 May 2025 03:18:30 -0700 Subject: hv_netvsc: fix potential deadlock in netvsc_vf_setxdp() The MANA driver's probe registers netdevice via the following call chain: mana_probe() register_netdev() register_netdevice() register_netdevice() calls notifier callback for netvsc driver, holding the netdev mutex via netdev_lock_ops(). Further this netvsc notifier callback end up attempting to acquire the same lock again in dev_xdp_propagate() leading to deadlock. netvsc_netdev_event() netvsc_vf_setxdp() dev_xdp_propagate() This deadlock was not observed so far because net_shaper_ops was never set, and thus the lock was effectively a no-op in this case. Fix this by using netif_xdp_propagate() instead of dev_xdp_propagate() to avoid recursive locking in this path. And, since no deadlock is observed on the other path which is via netvsc_probe, add the lock exclusivly for that path. Also, clean up the unregistration path by removing the unnecessary call to netvsc_vf_setxdp(), since unregister_netdevice_many_notify() already performs this cleanup via dev_xdp_uninstall(). Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf") Cc: stable@vger.kernel.org Signed-off-by: Saurabh Sengar Tested-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Subbaraya Sundeep Link: https://patch.msgid.link/1748513910-23963-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/netvsc_bpf.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 4 ++-- net/core/dev.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index e01c5997a551..1dd3755d9e6d 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) xdp.command = XDP_SETUP_PROG; xdp.prog = prog; - ret = dev_xdp_propagate(vf_netdev, &xdp); + ret = netif_xdp_propagate(vf_netdev, &xdp); if (ret && prog) bpf_prog_put(prog); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 14a0d04e21ae..c41a025c66f0 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2462,8 +2462,6 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name); - netvsc_vf_setxdp(vf_netdev, NULL); - reinit_completion(&net_device_ctx->vf_add); netdev_rx_handler_unregister(vf_netdev); netdev_upper_dev_unlink(vf_netdev, ndev); @@ -2631,7 +2629,9 @@ static int netvsc_probe(struct hv_device *dev, continue; netvsc_prepare_bonding(vf_netdev); + netdev_lock_ops(vf_netdev); netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE); + netdev_unlock_ops(vf_netdev); __netvsc_vf_setup(net, vf_netdev); break; } diff --git a/net/core/dev.c b/net/core/dev.c index 2b514d95c528..a388f459a366 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9968,6 +9968,7 @@ int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) return dev->netdev_ops->ndo_bpf(dev, bpf); } +EXPORT_SYMBOL_GPL(netif_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { -- cgit v1.2.3-59-g8ed1b From 6043b794c7668c19dabc4a93c75b924a19474d59 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 29 May 2025 12:28:05 +0200 Subject: net: Fix checksum update for ILA adj-transport During ILA address translations, the L4 checksums can be handled in different ways. One of them, adj-transport, consist in parsing the transport layer and updating any found checksum. This logic relies on inet_proto_csum_replace_by_diff and produces an incorrect skb->csum when in state CHECKSUM_COMPLETE. This bug can be reproduced with a simple ILA to SIR mapping, assuming packets are received with CHECKSUM_COMPLETE: $ ip a show dev eth0 14: eth0@if15: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 62:ae:35:9e:0f:8d brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet6 3333:0:0:1::c078/64 scope global valid_lft forever preferred_lft forever inet6 fd00:10:244:1::c078/128 scope global nodad valid_lft forever preferred_lft forever inet6 fe80::60ae:35ff:fe9e:f8d/64 scope link proto kernel_ll valid_lft forever preferred_lft forever $ ip ila add loc_match fd00:10:244:1 loc 3333:0:0:1 \ csum-mode adj-transport ident-type luid dev eth0 Then I hit [fd00:10:244:1::c078]:8000 with a server listening only on [3333:0:0:1::c078]:8000. With the bug, the SYN packet is dropped with SKB_DROP_REASON_TCP_CSUM after inet_proto_csum_replace_by_diff changed skb->csum. The translation and drop are visible on pwru [1] traces: IFACE TUPLE FUNC eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) ipv6_rcv eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) ip6_rcv_core eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) nf_hook_slow eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) inet_proto_csum_replace_by_diff eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) tcp_v6_early_demux eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_route_input eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_input eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_input_finish eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_protocol_deliver_rcu eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) raw6_local_deliver eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ipv6_raw_deliver eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) tcp_v6_rcv eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) __skb_checksum_complete eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) kfree_skb_reason(SKB_DROP_REASON_TCP_CSUM) eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) skb_release_head_state eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) skb_release_data eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) skb_free_head eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) kfree_skbmem This is happening because inet_proto_csum_replace_by_diff is updating skb->csum when it shouldn't. The L4 checksum is updated such that it "cancels" the IPv6 address change in terms of checksum computation, so the impact on skb->csum is null. Note this would be different for an IPv4 packet since three fields would be updated: the IPv4 address, the IP checksum, and the L4 checksum. Two would cancel each other and skb->csum would still need to be updated to take the L4 checksum change into account. This patch fixes it by passing an ipv6 flag to inet_proto_csum_replace_by_diff, to skip the skb->csum update if we're in the IPv6 case. Note the behavior of the only other user of inet_proto_csum_replace_by_diff, the BPF subsystem, is left as is in this patch and fixed in the subsequent patch. With the fix, using the reproduction from above, I can confirm skb->csum is not touched by inet_proto_csum_replace_by_diff and the TCP SYN proceeds to the application after the ILA translation. Link: https://github.com/cilium/pwru [1] Fixes: 65d7ab8de582 ("net: Identifier Locator Addressing module") Signed-off-by: Paul Chaignon Acked-by: Daniel Borkmann Link: https://patch.msgid.link/b5539869e3550d46068504feb02d37653d939c0b.1748509484.git.paul.chaignon@gmail.com Signed-off-by: Jakub Kicinski --- include/net/checksum.h | 2 +- net/core/filter.c | 2 +- net/core/utils.c | 4 ++-- net/ipv6/ila/ila_common.c | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/checksum.h b/include/net/checksum.h index e57986b173f8..3cbab35de5ab 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -152,7 +152,7 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr); + __wsum diff, bool pseudohdr, bool ipv6); static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index ab456bf1056e..f1de7bd8b547 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1987,7 +1987,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL; - inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, false); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); diff --git a/net/core/utils.c b/net/core/utils.c index e47feeaa5a49..5e63b0ea21f3 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, EXPORT_SYMBOL(inet_proto_csum_replace16); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr) + __wsum diff, bool pseudohdr, bool ipv6) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace_by_diff(sum, diff); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 95e9146918cc..b8d43ed4689d 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -86,7 +86,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, - diff, true); + diff, true, true); } break; case NEXTHDR_UDP: @@ -97,7 +97,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, - diff, true); + diff, true, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -111,7 +111,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, - diff, true); + diff, true, true); } break; } -- cgit v1.2.3-59-g8ed1b From ead7f9b8de65632ef8060b84b0c55049a33cfea1 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 29 May 2025 12:28:35 +0200 Subject: bpf: Fix L4 csum update on IPv6 in CHECKSUM_COMPLETE In Cilium, we use bpf_csum_diff + bpf_l4_csum_replace to, among other things, update the L4 checksum after reverse SNATing IPv6 packets. That use case is however not currently supported and leads to invalid skb->csum values in some cases. This patch adds support for IPv6 address changes in bpf_l4_csum_update via a new flag. When calling bpf_l4_csum_replace in Cilium, it ends up calling inet_proto_csum_replace_by_diff: 1: void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, 2: __wsum diff, bool pseudohdr) 3: { 4: if (skb->ip_summed != CHECKSUM_PARTIAL) { 5: csum_replace_by_diff(sum, diff); 6: if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) 7: skb->csum = ~csum_sub(diff, skb->csum); 8: } else if (pseudohdr) { 9: *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); 10: } 11: } The bug happens when we're in the CHECKSUM_COMPLETE state. We've just updated one of the IPv6 addresses. The helper now updates the L4 header checksum on line 5. Next, it updates skb->csum on line 7. It shouldn't. For an IPv6 packet, the updates of the IPv6 address and of the L4 checksum will cancel each other. The checksums are set such that computing a checksum over the packet including its checksum will result in a sum of 0. So the same is true here when we update the L4 checksum on line 5. We'll update it as to cancel the previous IPv6 address update. Hence skb->csum should remain untouched in this case. The same bug doesn't affect IPv4 packets because, in that case, three fields are updated: the IPv4 address, the IP checksum, and the L4 checksum. The change to the IPv4 address and one of the checksums still cancel each other in skb->csum, but we're left with one checksum update and should therefore update skb->csum accordingly. That's exactly what inet_proto_csum_replace_by_diff does. This special case for IPv6 L4 checksums is also described atop inet_proto_csum_replace16, the function we should be using in this case. This patch introduces a new bpf_l4_csum_replace flag, BPF_F_IPV6, to indicate that we're updating the L4 checksum of an IPv6 packet. When the flag is set, inet_proto_csum_replace_by_diff will skip the skb->csum update. Fixes: 7d672345ed295 ("bpf: add generic bpf_csum_diff helper") Signed-off-by: Paul Chaignon Acked-by: Daniel Borkmann Link: https://patch.msgid.link/96a6bc3a443e6f0b21ff7b7834000e17fb549e05.1748509484.git.paul.chaignon@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 5 +++-- tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85180e4aaa5a..0b4a2f124d11 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2056,6 +2056,7 @@ union bpf_attr { * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates * that the modified header field is part of the pseudo-header. + * Flag **BPF_F_IPV6** should be set for IPv6 packets. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more @@ -6072,6 +6073,7 @@ enum { BPF_F_PSEUDO_HDR = (1ULL << 4), BPF_F_MARK_MANGLED_0 = (1ULL << 5), BPF_F_MARK_ENFORCE = (1ULL << 6), + BPF_F_IPV6 = (1ULL << 7), }; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ diff --git a/net/core/filter.c b/net/core/filter.c index f1de7bd8b547..327ca73f9cd7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1968,10 +1968,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | - BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1987,7 +1988,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL; - inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, false); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85180e4aaa5a..0b4a2f124d11 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2056,6 +2056,7 @@ union bpf_attr { * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates * that the modified header field is part of the pseudo-header. + * Flag **BPF_F_IPV6** should be set for IPv6 packets. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more @@ -6072,6 +6073,7 @@ enum { BPF_F_PSEUDO_HDR = (1ULL << 4), BPF_F_MARK_MANGLED_0 = (1ULL << 5), BPF_F_MARK_ENFORCE = (1ULL << 6), + BPF_F_IPV6 = (1ULL << 7), }; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -- cgit v1.2.3-59-g8ed1b From 3382a1ed7f778db841063f5d7e317ac55f9e7f72 Mon Sep 17 00:00:00 2001 From: Shiming Cheng Date: Fri, 30 May 2025 09:26:08 +0800 Subject: net: fix udp gso skb_segment after pull from frag_list Commit a1e40ac5b5e9 ("net: gso: fix udp gso fraglist segmentation after pull from frag_list") detected invalid geometry in frag_list skbs and redirects them from skb_segment_list to more robust skb_segment. But some packets with modified geometry can also hit bugs in that code. We don't know how many such cases exist. Addressing each one by one also requires touching the complex skb_segment code, which risks introducing bugs for other types of skbs. Instead, linearize all these packets that fail the basic invariants on gso fraglist skbs. That is more robust. If only part of the fraglist payload is pulled into head_skb, it will always cause exception when splitting skbs by skb_segment. For detailed call stack information, see below. Valid SKB_GSO_FRAGLIST skbs - consist of two or more segments - the head_skb holds the protocol headers plus first gso_size - one or more frag_list skbs hold exactly one segment - all but the last must be gso_size Optional datapath hooks such as NAT and BPF (bpf_skb_pull_data) can modify fraglist skbs, breaking these invariants. In extreme cases they pull one part of data into skb linear. For UDP, this causes three payloads with lengths of (11,11,10) bytes were pulled tail to become (12,10,10) bytes. The skbs no longer meets the above SKB_GSO_FRAGLIST conditions because payload was pulled into head_skb, it needs to be linearized before pass to regular skb_segment. skb_segment+0xcd0/0xd14 __udp_gso_segment+0x334/0x5f4 udp4_ufo_fragment+0x118/0x15c inet_gso_segment+0x164/0x338 skb_mac_gso_segment+0xc4/0x13c __skb_gso_segment+0xc4/0x124 validate_xmit_skb+0x9c/0x2c0 validate_xmit_skb_list+0x4c/0x80 sch_direct_xmit+0x70/0x404 __dev_queue_xmit+0x64c/0xe5c neigh_resolve_output+0x178/0x1c4 ip_finish_output2+0x37c/0x47c __ip_finish_output+0x194/0x240 ip_finish_output+0x20/0xf4 ip_output+0x100/0x1a0 NF_HOOK+0xc4/0x16c ip_forward+0x314/0x32c ip_rcv+0x90/0x118 __netif_receive_skb+0x74/0x124 process_backlog+0xe8/0x1a4 __napi_poll+0x5c/0x1f8 net_rx_action+0x154/0x314 handle_softirqs+0x154/0x4b8 [118.376811] [C201134] rxq0_pus: [name:bug&]kernel BUG at net/core/skbuff.c:4278! [118.376829] [C201134] rxq0_pus: [name:traps&]Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP [118.470774] [C201134] rxq0_pus: [name:mrdump&]Kernel Offset: 0x178cc00000 from 0xffffffc008000000 [118.470810] [C201134] rxq0_pus: [name:mrdump&]PHYS_OFFSET: 0x40000000 [118.470827] [C201134] rxq0_pus: [name:mrdump&]pstate: 60400005 (nZCv daif +PAN -UAO) [118.470848] [C201134] rxq0_pus: [name:mrdump&]pc : [0xffffffd79598aefc] skb_segment+0xcd0/0xd14 [118.470900] [C201134] rxq0_pus: [name:mrdump&]lr : [0xffffffd79598a5e8] skb_segment+0x3bc/0xd14 [118.470928] [C201134] rxq0_pus: [name:mrdump&]sp : ffffffc008013770 Fixes: a1e40ac5b5e9 ("gso: fix udp gso fraglist segmentation after pull from frag_list") Signed-off-by: Shiming Cheng Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 9c775f8aa438..85b5aa82d7d7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -495,6 +495,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) @@ -523,6 +524,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); -- cgit v1.2.3-59-g8ed1b From ea77c397bff8b6d59f6d83dae1425b08f465e8b5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 23 May 2025 14:20:44 +0200 Subject: netfilter: nf_set_pipapo_avx2: fix initial map fill If the first field doesn't cover the entire start map, then we must zero out the remainder, else we leak those bits into the next match round map. The early fix was incomplete and did only fix up the generic C implementation. A followup patch adds a test case to nft_concat_range.sh. Fixes: 791a615b7ad2 ("netfilter: nf_set_pipapo: fix initial map fill") Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index c15db28c5ebc..be7c16c79f71 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1113,6 +1113,25 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, return true; } +/** + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. + */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + /** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace @@ -1171,7 +1190,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 0 : m->bsize_max); - /* Starting map doesn't need to be set for this implementation */ + pipapo_resmap_init_avx2(m, res); nft_pipapo_avx2_prepare(); -- cgit v1.2.3-59-g8ed1b From 50d9ce9679dd50df2dc51ada717fa875bc248fad Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 May 2025 12:34:02 +0200 Subject: netfilter: nf_nat: also check reverse tuple to obtain clashing entry The logic added in the blamed commit was supposed to only omit nat source port allocation if neither the existing nor the new entry are subject to NAT. However, its not enough to lookup the conntrack based on the proposed tuple, we must also check the reverse direction. Otherwise there are esoteric cases where the collision is in the reverse direction because that colliding connection has a port rewrite, but the new entry doesn't. In this case, we only check the new entry and then erronously conclude that no clash exists anymore. The existing (udp) tuple is: a:p -> b:P, with nat translation to s:P, i.e. pure daddr rewrite, reverse tuple in conntrack table is s:P -> a:p. When another UDP packet is sent directly to s, i.e. a:p->s:P, this is correctly detected as a colliding entry: tuple is taken by existing reply tuple in reverse direction. But the colliding conntrack is only searched for with unreversed direction, and we can't find such entry matching a:p->s:P. The incorrect conclusion is that the clashing entry has timed out and that no port address translation is required. Such conntrack will then be discarded at nf_confirm time because the proposed reverse direction clashes with an existing mapping in the conntrack table. Search for the reverse tuple too, this will then check the NAT bits of the colliding entry and triggers port reallocation. Followp patch extends nft_nat.sh selftest to cover this scenario. The IPS_SEQ_ADJUST change is also a bug fix: Instead of checking for SEQ_ADJ this tested for SEEN_REPLY and ASSURED by accident -- _BIT is only for use with the test_bit() API. This bug has little consequence in practice, because the sequence number adjustments are only useful for TCP which doesn't support clash resolution. The existing test case (conntrack_reverse_clash.sh) exercise a race condition path (parallel conntrack creation on different CPUs), so the colliding entries have neither SEEN_REPLY nor ASSURED set. Thanks to Yafang Shao and Shaun Brady for an initial investigation of this bug. Fixes: d8f84a9bc7c4 ("netfilter: nf_nat: don't try nat source port reallocation for reverse dir clash") Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1795 Reported-by: Yafang Shao Reported-by: Shaun Brady Signed-off-by: Florian Westphal Tested-by: Yafang Shao Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index aad84aabd7f1..f391cd267922 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -248,7 +248,7 @@ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { - static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; @@ -287,8 +287,14 @@ nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); - if (unlikely(!thash)) /* clashing entry went away */ - return false; + if (unlikely(!thash)) { + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } ct = nf_ct_tuplehash_to_ctrack(thash); -- cgit v1.2.3-59-g8ed1b From 1e1f706fc2ce90eaaf3480b3d5f27885960d751c Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 3 Jun 2025 15:35:38 +1000 Subject: wifi: cfg80211/mac80211: correctly parse S1G beacon optional elements S1G beacons are not traditional beacons but a type of extension frame. Extension frames contain the frame control and duration fields, followed by zero or more optional fields before the frame body. These optional fields are distinct from the variable length elements. The presence of optional fields is indicated in the frame control field. To correctly locate the elements offset, the frame control must be parsed to identify which optional fields are present. Currently, mac80211 parses S1G beacons based on fixed assumptions about the frame layout, without inspecting the frame control field. This can result in incorrect offsets to the "variable" portion of the frame. Properly parse S1G beacon frames by using the field lengths defined in IEEE 802.11-2024, section 9.3.4.3, ensuring that the elements offset is calculated accurately. Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results") Fixes: cd418ba63f0c ("mac80211: convert S1G beacon to scan results") Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250603053538.468562-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 79 +++++++++++++++++++++++++++++++++++++++++------ net/mac80211/mlme.c | 7 ++--- net/mac80211/scan.c | 11 +++---- net/wireless/scan.c | 18 +++++------ 4 files changed, 83 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 420c7f9aa6ee..ce377f7fb912 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -111,6 +111,8 @@ /* bits unique to S1G beacon */ #define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 +#define IEEE80211_S1G_BCN_CSSID 0x200 +#define IEEE80211_S1G_BCN_ANO 0x400 /* see 802.11ah-2016 9.9 NDP CMAC frames */ #define IEEE80211_S1G_1MHZ_NDP_BITS 25 @@ -153,9 +155,6 @@ #define IEEE80211_ANO_NETTYPE_WILD 15 -/* bits unique to S1G beacon */ -#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 - /* control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT */ #define IEEE80211_CTL_EXT_POLL 0x2000 #define IEEE80211_CTL_EXT_SPR 0x3000 @@ -627,6 +626,42 @@ static inline bool ieee80211_is_s1g_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); } +/** + * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * next TBTT field + */ +static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); +} + +/** + * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * ANO field + */ +static inline bool ieee80211_s1g_has_ano(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO)); +} + +/** + * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * compressed SSID field + */ +static inline bool ieee80211_s1g_has_cssid(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); +} + /** * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon * @fc: frame control bytes in little-endian byteorder @@ -1245,16 +1280,40 @@ struct ieee80211_ext { u8 change_seq; u8 variable[0]; } __packed s1g_beacon; - struct { - u8 sa[ETH_ALEN]; - __le32 timestamp; - u8 change_seq; - u8 next_tbtt[3]; - u8 variable[0]; - } __packed s1g_short_beacon; } u; } __packed __aligned(2); +/** + * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields + * @fc: frame control bytes in little-endian byteorder + * Return: total length in bytes of the optional fixed-length fields + * + * S1G beacons may contain up to three optional fixed-length fields that + * precede the variable-length elements. Whether these fields are present + * is indicated by flags in the frame control field. + * + * From IEEE 802.11-2024 section 9.3.4.3: + * - Next TBTT field may be 0 or 3 bytes + * - Short SSID field may be 0 or 4 bytes + * - Access Network Options (ANO) field may be 0 or 1 byte + */ +static inline size_t +ieee80211_s1g_optional_len(__le16 fc) +{ + size_t len = 0; + + if (ieee80211_s1g_has_next_tbtt(fc)) + len += 3; + + if (ieee80211_s1g_has_cssid(fc)) + len += 4; + + if (ieee80211_s1g_has_ano(fc)) + len += 1; + + return len; +} + #define IEEE80211_TWT_CONTROL_NDP BIT(0) #define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) #define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b84150dbfe8c..948909a242d6 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7220,11 +7220,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { struct ieee80211_ext *ext = (void *) mgmt; - - if (ieee80211_is_s1g_short_beacon(ext->frame_control)) - variable = ext->u.s1g_short_beacon.variable; - else - variable = ext->u.s1g_beacon.variable; + variable = ext->u.s1g_beacon.variable + + ieee80211_s1g_optional_len(ext->frame_control); } baselen = (u8 *) variable - (u8 *) mgmt; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 7b8da40a912d..cd8385ecafd9 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -276,6 +276,7 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; + struct ieee80211_ext *ext; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); @@ -285,12 +286,10 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_short_beacon.variable); - else - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_beacon); + ext = (struct ieee80211_ext *)mgmt; + min_hdr_len = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + ieee80211_s1g_optional_len(ext->frame_control); } if (skb->len < min_hdr_len) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ddd3a97f6609..e8a4fe44ec2d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3250,6 +3250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const u8 *ie; size_t ielen; u64 tsf; + size_t s1g_optional_len; if (WARN_ON(!mgmt)) return NULL; @@ -3264,12 +3265,11 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_short_beacon.variable); - else - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_beacon.variable); + s1g_optional_len = + ieee80211_s1g_optional_len(ext->frame_control); + min_hdr_len = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + s1g_optional_len; } else { /* same for beacons */ min_hdr_len = offsetof(struct ieee80211_mgmt, @@ -3285,11 +3285,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - ie = ext->u.s1g_short_beacon.variable; - else - ie = ext->u.s1g_beacon.variable; - + ie = ext->u.s1g_beacon.variable + s1g_optional_len; elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen); if (!elem) return NULL; -- cgit v1.2.3-59-g8ed1b From 535caaca921c653b1f9838fbd5c4e9494cafc3d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Jun 2025 09:39:28 +0000 Subject: net: annotate data-races around cleanup_net_task from_cleanup_net() reads cleanup_net_task locklessly. Add READ_ONCE()/WRITE_ONCE() annotations to avoid a potential KCSAN warning, even if the race is harmless. Fixes: 0734d7c3d93c ("net: expedite synchronize_net() for cleanup_net()") Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250604093928.1323333-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- net/core/net_namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a388f459a366..be97c440ecd5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10499,7 +10499,7 @@ static void dev_index_release(struct net *net, int ifindex) static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS - return current == cleanup_net_task; + return current == READ_ONCE(cleanup_net_task); #else return false; #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 42ee7fce3d95..ae54f26709ca 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -654,7 +654,7 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); - cleanup_net_task = current; + WRITE_ONCE(cleanup_net_task, current); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -704,7 +704,7 @@ static void cleanup_net(struct work_struct *work) put_user_ns(net->user_ns); net_passive_dec(net); } - cleanup_net_task = NULL; + WRITE_ONCE(cleanup_net_task, NULL); } /** -- cgit v1.2.3-59-g8ed1b From feafc73f3e6ae73371777a037d41d2e31c929636 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Jun 2025 10:58:15 +0000 Subject: net: prevent a NULL deref in rtnl_create_link() At the time rtnl_create_link() is running, dev->netdev_ops is NULL, we must not use netdev_lock_ops() or risk a NULL deref if CONFIG_NET_SHAPER is defined. Use netif_set_group() instead of dev_set_group(). RIP: 0010:netdev_need_ops_lock include/net/netdev_lock.h:33 [inline] RIP: 0010:netdev_lock_ops include/net/netdev_lock.h:41 [inline] RIP: 0010:dev_set_group+0xc0/0x230 net/core/dev_api.c:82 Call Trace: rtnl_create_link+0x748/0xd10 net/core/rtnetlink.c:3674 rtnl_newlink_create+0x25c/0xb00 net/core/rtnetlink.c:3813 __rtnl_newlink net/core/rtnetlink.c:3940 [inline] rtnl_newlink+0x16d6/0x1c70 net/core/rtnetlink.c:4055 rtnetlink_rcv_msg+0x7cf/0xb70 net/core/rtnetlink.c:6944 netlink_rcv_skb+0x208/0x470 net/netlink/af_netlink.c:2534 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x75b/0x8d0 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:712 [inline] Reported-by: syzbot+9fc858ba0312b42b577e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6840265f.a00a0220.d4325.0009.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Fixes: 7e4d784f5810 ("net: hold netdev instance lock during rtnetlink operations") Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250604105815.1516973-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f9a35bdc58ad..c57692eb8da9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3671,7 +3671,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) -- cgit v1.2.3-59-g8ed1b From 7632fedb266d93ed0ed9f487133e6c6314a9b2d1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 4 Jun 2025 14:32:52 +0300 Subject: seg6: Fix validation of nexthop addresses The kernel currently validates that the length of the provided nexthop address does not exceed the specified length. This can lead to the kernel reading uninitialized memory if user space provided a shorter length than the specified one. Fix by validating that the provided length exactly matches the specified one. Fixes: d1df6fd8a1d2 ("ipv6: sr: define core operations for seg6local lightweight tunnel") Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20250604113252.371528-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_local.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ac1dbd492c22..a11a02b4ba95 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1644,10 +1644,8 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 }, - [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)), + [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, -- cgit v1.2.3-59-g8ed1b From 3cae906e1a6184cdc9e4d260e4dbdf9a118d94ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Jun 2025 13:38:26 +0000 Subject: calipso: unlock rcu before returning -EAFNOSUPPORT syzbot reported that a recent patch forgot to unlock rcu in the error path. Adopt the convention that netlbl_conn_setattr() is already using. Fixes: 6e9f2df1c550 ("calipso: Don't call calipso functions for AF_INET sk.") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Kuniyuki Iwashima Acked-by: Paul Moore Link: https://patch.msgid.link/20250604133826.1667664-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/netlabel/netlabel_kapi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 6ea16138582c..33b77084a4e5 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1165,8 +1165,10 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - if (sk->sk_family != AF_INET6) - return -EAFNOSUPPORT; + if (sk->sk_family != AF_INET6) { + ret_val = -EAFNOSUPPORT; + goto conn_setattr_return; + } addr6 = (struct sockaddr_in6 *)addr; entry = netlbl_domhsh_getentry_af6(secattr->domain, -- cgit v1.2.3-59-g8ed1b