aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
commit8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf (patch)
treefec3039a08284cd87f4ec9c3bea5b5a439f1859f /net/core/skbuff.c
parentMerge tag 'sysctl-6.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl (diff)
parentdpll: zl3073x: Fix build failure (diff)
downloadwireguard-linux-8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf.tar.xz
wireguard-linux-8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf.zip
Merge tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Wrap datapath globals into net_aligned_data, to avoid false sharing - Preserve MSG_ZEROCOPY in forwarding (e.g. out of a container) - Add SO_INQ and SCM_INQ support to AF_UNIX - Add SIOCINQ support to AF_VSOCK - Add TCP_MAXSEG sockopt to MPTCP - Add IPv6 force_forwarding sysctl to enable forwarding per interface - Make TCP validation of whether packet fully fits in the receive window and the rcv_buf more strict. With increased use of HW aggregation a single "packet" can be multiple 100s of kB - Add MSG_MORE flag to optimize large TCP transmissions via sockmap, improves latency up to 33% for sockmap users - Convert TCP send queue handling from tasklet to BH workque - Improve BPF iteration over TCP sockets to see each socket exactly once - Remove obsolete and unused TCP RFC3517/RFC6675 loss recovery code - Support enabling kernel threads for NAPI processing on per-NAPI instance basis rather than a whole device. Fully stop the kernel NAPI thread when threaded NAPI gets disabled. Previously thread would stick around until ifdown due to tricky synchronization - Allow multicast routing to take effect on locally-generated packets - Add output interface argument for End.X in segment routing - MCTP: add support for gateway routing, improve bind() handling - Don't require rtnl_lock when fetching an IPv6 neighbor over Netlink - Add a new neighbor flag ("extern_valid"), which cedes refresh responsibilities to userspace. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed - Support NUD_PERMANENT for proxy neighbor entries - Add a new queuing discipline for IETF RFC9332 DualQ Coupled AQM - Add sequence numbers to netconsole messages. Unregister netconsole's console when all net targets are removed. Code refactoring. Add a number of selftests - Align IPSec inbound SA lookup to RFC 4301. Only SPI and protocol should be used for an inbound SA lookup - Support inspecting ref_tracker state via DebugFS - Don't force bonding advertisement frames tx to ~333 ms boundaries. Add broadcast_neighbor option to send ARP/ND on all bonded links - Allow providing upcall pid for the 'execute' command in openvswitch - Remove DCCP support from Netfilter's conntrack - Disallow multiple packet duplications in the queuing layer - Prevent use of deprecated iptables code on PREEMPT_RT Driver API: - Support RSS and hashing configuration over ethtool Netlink - Add dedicated ethtool callbacks for getting and setting hashing fields - Add support for power budget evaluation strategy in PSE / Power-over-Ethernet. Generate Netlink events for overcurrent etc - Support DPLL phase offset monitoring across all device inputs. Support providing clock reference and SYNC over separate DPLL inputs - Support traffic classes in devlink rate API for bandwidth management - Remove rtnl_lock dependency from UDP tunnel port configuration Device drivers: - Add a new Broadcom driver for 800G Ethernet (bnge) - Add a standalone driver for Microchip ZL3073x DPLL - Remove IBM's NETIUCV device driver - Ethernet high-speed NICs: - Broadcom (bnxt): - support zero-copy Tx of DMABUF memory - take page size into account for page pool recycling rings - Intel (100G, ice, idpf): - idpf: XDP and AF_XDP support preparations - idpf: add flow steering - add link_down_events statistic - clean up the TSPLL code - preparations for live VM migration - nVidia/Mellanox: - support zero-copy Rx/Tx interfaces (DMABUF and io_uring) - optimize context memory usage for matchers - expose serial numbers in devlink info - support PCIe congestion metrics - Meta (fbnic): - add 25G, 50G, and 100G link modes to phylink - support dumping FW logs - Marvell/Cavium: - support for CN20K generation of the Octeon chips - Amazon: - add HW clock (without timestamping, just hypervisor time access) - Ethernet virtual: - VirtIO net: - support segmentation of UDP-tunnel-encapsulated packets - Google (gve): - support packet timestamping and clock synchronization - Microsoft vNIC: - add handler for device-originated servicing events - allow dynamic MSI-X vector allocation - support Tx bandwidth clamping - Ethernet NICs consumer, and embedded: - AMD: - amd-xgbe: hardware timestamping and PTP clock support - Broadcom integrated MACs (bcmgenet, bcmasp): - use napi_complete_done() return value to support NAPI polling - add support for re-starting auto-negotiation - Broadcom switches (b53): - support BCM5325 switches - add bcm63xx EPHY power control - Synopsys (stmmac): - lots of code refactoring and cleanups - TI: - icssg-prueth: read firmware-names from device tree - icssg: PRP offload support - Microchip: - lan78xx: convert to PHYLINK for improved PHY and MAC management - ksz: add KSZ8463 switch support - Intel: - support similar queue priority scheme in multi-queue and time-sensitive networking (taprio) - support packet pre-emption in both - RealTek (r8169): - enable EEE at 5Gbps on RTL8126 - Airoha: - add PPPoE offload support - MDIO bus controller for Airoha AN7583 - Ethernet PHYs: - support for the IPQ5018 internal GE PHY - micrel KSZ9477 switch-integrated PHYs: - add MDI/MDI-X control support - add RX error counters - add cable test support - add Signal Quality Indicator (SQI) reporting - dp83tg720: improve reset handling and reduce link recovery time - support bcm54811 (and its MII-Lite interface type) - air_en8811h: support resume/suspend - support PHY counters for QCA807x and QCA808x - support WoL for QCA807x - CAN drivers: - rcar_canfd: support for Transceiver Delay Compensation - kvaser: report FW versions via devlink dev info - WiFi: - extended regulatory info support (6 GHz) - add statistics and beacon monitor for Multi-Link Operation (MLO) - support S1G aggregation, improve S1G support - add Radio Measurement action fields - support per-radio RTS threshold - some work around how FIPS affects wifi, which was wrong (RC4 is used by TKIP, not only WEP) - improvements for unsolicited probe response handling - WiFi drivers: - RealTek (rtw88): - IBSS mode for SDIO devices - RealTek (rtw89): - BT coexistence for MLO/WiFi7 - concurrent station + P2P support - support for USB devices RTL8851BU/RTL8852BU - Intel (iwlwifi): - use embedded PNVM in (to be released) FW images to fix compatibility issues - many cleanups (unused FW APIs, PCIe code, WoWLAN) - some FIPS interoperability - MediaTek (mt76): - firmware recovery improvements - more MLO work - Qualcomm/Atheros (ath12k): - fix scan on multi-radio devices - more EHT/Wi-Fi 7 features - encapsulation/decapsulation offload - Broadcom (brcm80211): - support SDIO 43751 device - Bluetooth: - hci_event: add support for handling LE BIG Sync Lost event - ISO: add socket option to report packet seqnum via CMSG - ISO: support SCM_TIMESTAMPING for ISO TS - Bluetooth drivers: - intel_pcie: support Function Level Reset - nxpuart: add support for 4M baudrate - nxpuart: implement powerup sequence, reset, FW dump, and FW loading" * tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1742 commits) dpll: zl3073x: Fix build failure selftests: bpf: fix legacy netfilter options ipv6: annotate data-races around rt->fib6_nsiblings ipv6: fix possible infinite loop in fib6_info_uses_dev() ipv6: prevent infinite loop in rt6_nlmsg_size() ipv6: add a retry logic in net6_rt_notify() vrf: Drop existing dst reference in vrf_ip6_input_dst net/sched: taprio: align entry index attr validation with mqprio net: fsl_pq_mdio: use dev_err_probe selftests: rtnetlink.sh: remove esp4_offload after test vsock: remove unnecessary null check in vsock_getname() igb: xsk: solve negative overflow of nb_pkts in zerocopy mode stmmac: xsk: fix negative overflow of budget in zerocopy mode dt-bindings: ieee802154: Convert at86rf230.txt yaml format net: dsa: microchip: Disable PTP function of KSZ8463 net: dsa: microchip: Setup fiber ports for KSZ8463 net: dsa: microchip: Write switch MAC address differently for KSZ8463 net: dsa: microchip: Use different registers for KSZ8463 net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver dt-bindings: net: dsa: microchip: Add KSZ8463 switch support ...
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--net/core/skbuff.c38
1 files changed, 18 insertions, 20 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d6420b74ea9c..ee0274417948 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -384,8 +384,7 @@ static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
skb_set_kcov_handle(skb, kcov_common_handle());
}
-static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
- unsigned int *size)
+static inline void *__slab_build_skb(void *data, unsigned int *size)
{
void *resized;
@@ -418,7 +417,7 @@ struct sk_buff *slab_build_skb(void *data)
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
- data = __slab_build_skb(skb, data, &size);
+ data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
return skb;
@@ -435,7 +434,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
* using slab buffer should use slab_build_skb() instead.
*/
if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
- data = __slab_build_skb(skb, data, &size);
+ data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
}
@@ -3060,10 +3059,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static bool spd_fill_page(struct splice_pipe_desc *spd,
- struct pipe_inode_info *pipe, struct page *page,
- unsigned int *len, unsigned int offset,
- bool linear,
+static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+ unsigned int *len, unsigned int offset, bool linear,
struct sock *sk)
{
if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
@@ -3091,8 +3088,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
unsigned int plen, unsigned int *off,
unsigned int *len,
struct splice_pipe_desc *spd, bool linear,
- struct sock *sk,
- struct pipe_inode_info *pipe)
+ struct sock *sk)
{
if (!*len)
return true;
@@ -3111,8 +3107,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
do {
unsigned int flen = min(*len, plen);
- if (spd_fill_page(spd, pipe, page, &flen, poff,
- linear, sk))
+ if (spd_fill_page(spd, page, &flen, poff, linear, sk))
return true;
poff += flen;
plen -= flen;
@@ -3130,8 +3125,8 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
unsigned int *offset, unsigned int *len,
struct splice_pipe_desc *spd, struct sock *sk)
{
- int seg;
struct sk_buff *iter;
+ int seg;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -3143,7 +3138,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
skb_headlen(skb),
offset, len, spd,
skb_head_is_locked(skb),
- sk, pipe))
+ sk))
return true;
/*
@@ -3160,7 +3155,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
- offset, len, spd, false, sk, pipe))
+ offset, len, spd, false, sk))
return true;
}
@@ -3235,6 +3230,7 @@ typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
int len, sendmsg_func sendmsg, int flags)
{
+ int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0;
unsigned int orig_len = len;
struct sk_buff *head = skb;
unsigned short fragidx;
@@ -3252,6 +3248,8 @@ do_frag_list:
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_DONTWAIT | flags;
+ if (slen < len)
+ msg.msg_flags |= more_hint;
iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
@@ -3292,6 +3290,8 @@ do_frag_list:
flags,
};
+ if (slen < len)
+ msg.msg_flags |= more_hint;
bvec_set_page(&bvec, skb_frag_page(frag), slen,
skb_frag_off(frag) + offset);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
@@ -6763,8 +6763,7 @@ static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
/* carve out the first eat bytes from skb's frag_list. May recurse into
* pskb_carve()
*/
-static int pskb_carve_frag_list(struct sk_buff *skb,
- struct skb_shared_info *shinfo, int eat,
+static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat,
gfp_t gfp_mask)
{
struct sk_buff *list = shinfo->frag_list;
@@ -6869,7 +6868,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_clone_fraglist(skb);
/* split line is in frag list */
- if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
+ if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) {
/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
if (skb_has_frag_list(skb))
kfree_skb_list(skb_shinfo(skb)->frag_list);
@@ -7234,7 +7233,6 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
* @skb: The buffer to add pages to
* @iter: Iterator representing the pages to be added
* @maxsize: Maximum amount of pages to be added
- * @gfp: Allocation flags
*
* This is a common helper function for supporting MSG_SPLICE_PAGES. It
* extracts pages from an iterator and adds them to the socket buffer if
@@ -7245,7 +7243,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
* insufficient space in the buffer to transfer anything.
*/
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
- ssize_t maxsize, gfp_t gfp)
+ ssize_t maxsize)
{
size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
struct page *pages[8], **ppages = pages;