From f410cbea9f3d2675b4c8e52af1d1985b11b387d1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Apr 2024 11:42:31 +0000 Subject: tcp: annotate data-races around tp->window_clamp tp->window_clamp can be read locklessly, add READ_ONCE() and WRITE_ONCE() annotations. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20240404114231.2195171-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3167ad96567..9282fafc0e61 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -203,16 +203,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, - __u32 *rcv_wnd, __u32 *window_clamp, + __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); + u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ - if (*window_clamp == 0) - (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); - space = min(*window_clamp, space); + if (window_clamp == 0) + window_clamp = (U16_MAX << TCP_MAX_WSCALE); + space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) @@ -239,12 +240,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); - space = min_t(u32, space, *window_clamp); + space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); + WRITE_ONCE(*__window_clamp, + min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -3855,7 +3857,7 @@ static void tcp_connect_init(struct sock *sk) tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) - tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3863,7 +3865,7 @@ static void tcp_connect_init(struct sock *sk) /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) - tp->window_clamp = tcp_full_space(sk); + WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) -- cgit v1.2.3-59-g8ed1b From f6d827b180bda01f8805bf5e85307419b0d6f890 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 10 Apr 2024 12:05:01 -0700 Subject: net: move skb ref helpers to new header Add a new header, linux/skbuff_ref.h, which contains all the skb_*_ref() helpers. Many of the consumers of skbuff.h do not actually use any of the skb ref helpers, and we can speed up compilation a bit by minimizing this header file. Additionally in the later patch in the series we add page_pool support to skb_frag_ref(), which requires some page_pool dependencies. 
We can now add these dependencies to skbuff_ref.h instead of a very ubiquitous skbuff.h Signed-off-by: Mina Almasry Link: https://lore.kernel.org/r/20240410190505.1225848-2-almasrymina@google.com Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 1 + drivers/net/ethernet/marvell/sky2.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + drivers/net/ethernet/sun/cassini.c | 1 + drivers/net/veth.c | 1 + drivers/net/xen-netback/netback.c | 1 + include/linux/skbuff.h | 63 ------------------ include/linux/skbuff_ref.h | 75 ++++++++++++++++++++++ net/core/gro.c | 1 + net/core/skbuff.c | 1 + net/ipv4/esp4.c | 1 + net/ipv4/tcp_output.c | 1 + net/ipv6/esp6.c | 1 + net/tls/tls_device.c | 1 + net/tls/tls_device_fallback.c | 1 + net/tls/tls_strp.c | 1 + 16 files changed, 89 insertions(+), 63 deletions(-) create mode 100644 include/linux/skbuff_ref.h (limited to 'net/ipv4/tcp_output.c') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 6482728794dd..e8e460a92e0e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "chcr_ktls.h" static LIST_HEAD(uld_ctx_list); diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 07720841a8d7..f3f7f4cc27b3 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -34,6 +34,7 @@ #include #include #include +#include #include diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index eac49657bd07..8328df8645d5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index bfb903506367..8f1f43dbb76d 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/veth.c b/drivers/net/veth.c index bcdfbf61eb66..426e68a95067 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #define DRV_NAME "veth" diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index ef76850d9bcd..48254fc07d64 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -38,6 +38,7 @@ #include #include #include +#include #include diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7135a3e94afd..4072a7ee3859 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3492,73 +3492,10 @@ static inline struct page *skb_frag_page(const skb_frag_t *frag) return netmem_to_page(frag->netmem); } -/** - * __skb_frag_ref - take an addition reference on a paged fragment. - * @frag: the paged fragment - * - * Takes an additional reference on the paged fragment @frag. - */ -static inline void __skb_frag_ref(skb_frag_t *frag) -{ - get_page(skb_frag_page(frag)); -} - -/** - * skb_frag_ref - take an addition reference on a paged fragment of an skb. - * @skb: the buffer - * @f: the fragment offset. - * - * Takes an additional reference on the @f'th paged fragment of @skb. 
- */ -static inline void skb_frag_ref(struct sk_buff *skb, int f) -{ - __skb_frag_ref(&skb_shinfo(skb)->frags[f]); -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); -bool napi_pp_put_page(struct page *page); - -static inline void -skb_page_unref(struct page *page, bool recycle) -{ -#ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page)) - return; -#endif - put_page(page); -} - -/** - * __skb_frag_unref - release a reference on a paged fragment. - * @frag: the paged fragment - * @recycle: recycle the page if allocated via page_pool - * - * Releases a reference on the paged fragment @frag - * or recycles the page via the page_pool API. - */ -static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) -{ - skb_page_unref(skb_frag_page(frag), recycle); -} - -/** - * skb_frag_unref - release a reference on a paged fragment of an skb. - * @skb: the buffer - * @f: the fragment offset - * - * Releases a reference on the @f'th paged fragment of @skb. - */ -static inline void skb_frag_unref(struct sk_buff *skb, int f) -{ - struct skb_shared_info *shinfo = skb_shinfo(skb); - - if (!skb_zcopy_managed(skb)) - __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); -} - /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h new file mode 100644 index 000000000000..11f0a4063403 --- /dev/null +++ b/include/linux/skbuff_ref.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Skb ref helpers. + * + */ + +#ifndef _LINUX_SKBUFF_REF_H +#define _LINUX_SKBUFF_REF_H + +#include + +/** + * __skb_frag_ref - take an addition reference on a paged fragment. + * @frag: the paged fragment + * + * Takes an additional reference on the paged fragment @frag. + */ +static inline void __skb_frag_ref(skb_frag_t *frag) +{ + get_page(skb_frag_page(frag)); +} + +/** + * skb_frag_ref - take an addition reference on a paged fragment of an skb. + * @skb: the buffer + * @f: the fragment offset. + * + * Takes an additional reference on the @f'th paged fragment of @skb. + */ +static inline void skb_frag_ref(struct sk_buff *skb, int f) +{ + __skb_frag_ref(&skb_shinfo(skb)->frags[f]); +} + +bool napi_pp_put_page(struct page *page); + +static inline void +skb_page_unref(struct page *page, bool recycle) +{ +#ifdef CONFIG_PAGE_POOL + if (recycle && napi_pp_put_page(page)) + return; +#endif + put_page(page); +} + +/** + * __skb_frag_unref - release a reference on a paged fragment. + * @frag: the paged fragment + * @recycle: recycle the page if allocated via page_pool + * + * Releases a reference on the paged fragment @frag + * or recycles the page via the page_pool API. + */ +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) +{ + skb_page_unref(skb_frag_page(frag), recycle); +} + +/** + * skb_frag_unref - release a reference on a paged fragment of an skb. + * @skb: the buffer + * @f: the fragment offset + * + * Releases a reference on the @f'th paged fragment of @skb. 
+ */ +static inline void skb_frag_unref(struct sk_buff *skb, int f) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (!skb_zcopy_managed(skb)) + __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); +} + +#endif /* _LINUX_SKBUFF_REF_H */ diff --git a/net/core/gro.c b/net/core/gro.c index 83f35d99a682..2459ab697f7f 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@ #include #include #include +#include #define MAX_GRO_SKBS 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ab970ded8a7b..2554a6f5f386 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include #include +#include #include #include #include diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 40330253f076..dff04580318f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -20,6 +20,7 @@ #include #include #include +#include #include diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9282fafc0e61..61119d42b0fd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -44,6 +44,7 @@ #include #include #include +#include #include diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index fb431d0a3475..6bc0a84c8d05 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include #include #include +#include #include diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bf8ed36b1ad6..ab6e694f7bc2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "tls.h" #include "trace.h" diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 4e7228f275fa..f9e3d3d90dcf 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "tls.h" diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index ca1e0e198ceb..58c4b06f4f0c 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Tom Herbert */ #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 22555032c513e62fe744d4cdd553539897e8e922 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Apr 2024 21:45:58 +0000 Subject: tcp: remove dubious FIN exception from tcp_cwnd_test() tcp_cwnd_test() has a special handing for the last packet in the write queue if it is smaller than one MSS and has the FIN flag. This is in violation of TCP RFC, and seems quite dubious. This packet can be sent only if the current CWND is bigger than the number of packets in flight. Making tcp_cwnd_test() result independent of the first skb in the write queue is needed for the last patch of the series. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240418214600.1291486-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 61119d42b0fd..acbc76ca3e64 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2073,16 +2073,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. 
*/ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; - /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tcp_skb_pcount(skb) == 1) - return 1; - in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) @@ -2706,10 +2700,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; - int cwnd_quota; + u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; sent_pkts = 0; @@ -2743,7 +2736,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); - cwnd_quota = tcp_cwnd_test(tp, skb); + cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ @@ -2772,9 +2765,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - max_segs), + min(cwnd_quota, + max_segs), nonagle); if (skb->len > limit && -- cgit v1.2.3-59-g8ed1b From d5b38a71d3334bc87931997f1b0a16464bbbe2ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Apr 2024 21:45:59 +0000 Subject: tcp: call tcp_set_skb_tso_segs() from tcp_write_xmit() tcp_write_xmit() calls tcp_init_tso_segs() to set gso_size and gso_segs on the packet. tcp_init_tso_segs() requires the stack to maintain an up to date tcp_skb_pcount(), and this makes sense for packets in rtx queue. Not so much for packets still in the write queue. In the following patch, we don't want to deal with tcp_skb_pcount() when moving payload from 2nd skb to 1st skb in the write queue. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240418214600.1291486-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index acbc76ca3e64..5e8665241f93 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1502,18 +1502,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { + int tso_segs; + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. 
*/ - tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->tcp_gso_size = 0; - } else { - tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tcp_skb_pcount_set(skb, 1); + return 1; } + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, tso_segs); + return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various @@ -2097,10 +2101,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(skb, mss_now); - tso_segs = tcp_skb_pcount(skb); - } + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) + return tcp_set_skb_tso_segs(skb, mss_now); + return tso_segs; } @@ -2733,9 +2736,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_pacing_check(sk)) break; - tso_segs = tcp_init_tso_segs(skb, mss_now); - BUG_ON(!tso_segs); - cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) @@ -2745,6 +2745,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + tso_segs = tcp_set_skb_tso_segs(skb, mss_now); + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; -- cgit v1.2.3-59-g8ed1b From 8ee602c635206ed012f979370094015857c02359 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Apr 2024 21:46:00 +0000 Subject: tcp: try to send bigger TSO packets While investigating TCP performance, I found that TCP would sometimes send big skbs followed by a single MSS skb, in a 'locked' pattern. For instance, BIG TCP is enabled, MSS is set to have 4096 bytes of payload per segment. gso_max_size is set to 181000. This means that an optimal TCP packet size should contain 44 * 4096 = 180224 bytes of payload, However, I was seeing packets sizes interleaved in this pattern: 172032, 8192, 172032, 8192, 172032, 8192, tcp_tso_should_defer() heuristic is defeated, because after a split of a packet in write queue for whatever reason (this might be a too small CWND or a small enough pacing_rate), the leftover packet in the queue is smaller than the optimal size. It is time to try to make 'leftover packets' bigger so that tcp_tso_should_defer() can give its full potential. 
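As a rough illustration of the numbers quoted above (plain userspace C, not kernel code; the only inputs are the values this commit message assumes, MSS 4096 and gso_max_size 181000), the "locked" pattern falls straight out of the arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mss = 4096;		/* payload bytes per segment */
		unsigned int gso_max_size = 181000;

		/* Largest multiple of MSS below gso_max_size: the ideal TSO payload. */
		unsigned int ideal = (gso_max_size / mss) * mss;	/* 44 * 4096 = 180224 */

		/* Observed pattern: a 172032-byte skb followed by an 8192-byte
		 * leftover; together they make exactly one ideal-sized packet,
		 * so the small leftover keeps reappearing until payload is
		 * pulled into it from the next skb in the write queue.
		 */
		printf("ideal=%u leftover=%u\n", ideal, ideal - 172032);	/* 180224, 8192 */
		return 0;
	}
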
After this patch, we can see the following output: 14:13:34.009273 IP6 sender > receiver: Flags [P.], seq 4048380:4098360, ack 1, win 256, options [nop,nop,TS val 3425678144 ecr 1561784500], length 49980 14:13:34.010272 IP6 sender > receiver: Flags [P.], seq 4098360:4148340, ack 1, win 256, options [nop,nop,TS val 3425678145 ecr 1561784501], length 49980 14:13:34.011271 IP6 sender > receiver: Flags [P.], seq 4148340:4198320, ack 1, win 256, options [nop,nop,TS val 3425678146 ecr 1561784502], length 49980 14:13:34.012271 IP6 sender > receiver: Flags [P.], seq 4198320:4248300, ack 1, win 256, options [nop,nop,TS val 3425678147 ecr 1561784503], length 49980 14:13:34.013272 IP6 sender > receiver: Flags [P.], seq 4248300:4298280, ack 1, win 256, options [nop,nop,TS val 3425678148 ecr 1561784504], length 49980 14:13:34.014271 IP6 sender > receiver: Flags [P.], seq 4298280:4348260, ack 1, win 256, options [nop,nop,TS val 3425678149 ecr 1561784505], length 49980 14:13:34.015272 IP6 sender > receiver: Flags [P.], seq 4348260:4398240, ack 1, win 256, options [nop,nop,TS val 3425678150 ecr 1561784506], length 49980 14:13:34.016270 IP6 sender > receiver: Flags [P.], seq 4398240:4448220, ack 1, win 256, options [nop,nop,TS val 3425678151 ecr 1561784507], length 49980 14:13:34.017269 IP6 sender > receiver: Flags [P.], seq 4448220:4498200, ack 1, win 256, options [nop,nop,TS val 3425678152 ecr 1561784508], length 49980 14:13:34.018276 IP6 sender > receiver: Flags [P.], seq 4498200:4548180, ack 1, win 256, options [nop,nop,TS val 3425678153 ecr 1561784509], length 49980 14:13:34.019259 IP6 sender > receiver: Flags [P.], seq 4548180:4598160, ack 1, win 256, options [nop,nop,TS val 3425678154 ecr 1561784510], length 49980 With 200 concurrent flows on a 100Gbit NIC, we can see a reduction of TSO packets (and ACK packets) of about 30 %. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240418214600.1291486-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5e8665241f93..99a1d88f7f47 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2683,6 +2683,36 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +/* First skb in the write queue is smaller than ideal packet size. + * Check if we can move payload from the second skb in the queue. + */ +static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) +{ + struct sk_buff *next_skb = skb->next; + unsigned int nlen; + + if (tcp_skb_is_last(sk, skb)) + return; + + if (!tcp_skb_can_collapse(skb, next_skb)) + return; + + nlen = min_t(u32, amount, next_skb->len); + if (!nlen || !skb_shift(skb, next_skb, nlen)) + return; + + TCP_SKB_CB(skb)->end_seq += nlen; + TCP_SKB_CB(next_skb)->seq += nlen; + + if (!next_skb->len) { + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; + TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; + tcp_unlink_write_queue(next_skb, sk); + tcp_wmem_free_skb(sk, next_skb); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. 
@@ -2723,6 +2753,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; + int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ @@ -2744,6 +2775,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else break; } + cwnd_quota = min(cwnd_quota, max_segs); + missing_bytes = cwnd_quota * mss_now - skb->len; + if (missing_bytes > 0) + tcp_grow_skb(sk, skb, missing_bytes); tso_segs = tcp_set_skb_tso_segs(skb, mss_now); @@ -2767,8 +2802,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - min(cwnd_quota, - max_segs), + cwnd_quota, nonagle); if (skb->len > limit && -- cgit v1.2.3-59-g8ed1b From 2bf90a57f0e682872c5cfb66ffa45e432bb9c7ae Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Sun, 21 Apr 2024 12:20:09 +0800 Subject: tcp: update sacked after tracepoint in __tcp_retransmit_skb Marking TCP_SKB_CB(skb)->sacked with TCPCB_EVER_RETRANS after the traceopint (trace_tcp_retransmit_skb), then we can get the retransmission efficiency by counting skbs w/ and w/o TCPCB_EVER_RETRANS mark in this tracepoint. We have discussed to achieve this with BPF_SOCK_OPS in [0], and using tracepoint is thought to be a better solution. [0] https://lore.kernel.org/all/20240417124622.35333-1-lulie@linux.alibaba.com/ Signed-off-by: Philo Lu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 99a1d88f7f47..ce59e4499b66 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3418,11 +3418,6 @@ start: err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - /* To avoid taking spuriously low RTT samples based on a timestamp - * for a transmit that never happened, always mark EVER_RETRANS - */ - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); @@ -3432,6 +3427,12 @@ start: } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } + + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + return err; } -- cgit v1.2.3-59-g8ed1b From 5691276b39daf90294c6a81fb6d62d667f634c92 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 25 Apr 2024 11:13:36 +0800 Subject: rstreason: prepare for active reset Like what we did to passive reset: only passing possible reset reason in each active reset path. No functional changes. 
Signed-off-by: Jason Xing Acked-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/tcp.h | 3 ++- net/ipv4/tcp.c | 15 ++++++++++----- net/ipv4/tcp_output.c | 3 ++- net/ipv4/tcp_timer.c | 9 ++++++--- net/mptcp/protocol.c | 4 +++- net/mptcp/subflow.c | 5 +++-- 6 files changed, 26 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index ffc9371fe9de..a9eb21251195 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); -void tcp_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f23b97777ea5..4ec0f4feee00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -275,6 +275,7 @@ #include #include #include +#include #include #include @@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + tcp_send_active_reset(sk, sk->sk_allocation, + SK_RST_REASON_NOT_SPECIFIED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2885,7 +2887,8 @@ adjudge_to_death: struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2903,7 +2906,8 @@ adjudge_to_death: if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any()); + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); @@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err) smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ce59e4499b66..41c352bf3394 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3615,7 +3615,8 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. 
-DaveM */ -void tcp_send_active_reset(struct sock *sk, gfp_t priority) +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason) { struct sk_buff *skb; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 976db57b95d4..83fe7f62f7f1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,7 @@ #include #include #include +#include static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { @@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; @@ -768,7 +770,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } } - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); goto death; } @@ -795,7 +797,8 @@ static void tcp_keepalive_timer (struct timer_list *t) icsk->icsk_probes_out > 0) || (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_write_err(sk); goto out; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f8bc34f0d973..065967086492 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #endif #include #include +#include #include #include "protocol.h" #include "mib.h" @@ -2569,7 +2570,8 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { - tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_send_active_reset(tcp_sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 32fe2ef36d56..ac867d277860 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -414,7 +414,7 @@ void mptcp_subflow_reset(struct sock *ssk) /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); - tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); @@ -1350,7 +1350,8 @@ reset: tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); - tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_send_active_reset(ssk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); WRITE_ONCE(subflow->data_avail, false); return false; } -- cgit v1.2.3-59-g8ed1b From b533fb9cf4f7c6ca2aa255a5a1fdcde49fff2b24 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 25 Apr 2024 11:13:40 +0800 Subject: rstreason: make it work in trace world At last, we should let it work by introducing this reset reason in trace world. One of the possible expected outputs is: ... 
tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx state=TCP_ESTABLISHED reason=NOT_SPECIFIED Signed-off-by: Jason Xing Reviewed-by: Steven Rostedt (Google) Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/trace/events/tcp.h | 26 ++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 5c04a61a11c2..49b5ee091cf6 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * tcp event with arguments sk and skb @@ -74,20 +75,32 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_ARGS(sk, skb) ); +#undef FN +#define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason); +DEFINE_RST_REASON(FN, FN) + +#undef FN +#undef FNe +#define FN(reason) { SK_RST_REASON_##reason, #reason }, +#define FNe(reason) { SK_RST_REASON_##reason, #reason } + /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ TRACE_EVENT(tcp_send_reset, - TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + TP_PROTO(const struct sock *sk, + const struct sk_buff *skb, + const enum sk_rst_reason reason), - TP_ARGS(sk, skb), + TP_ARGS(sk, skb, reason), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) + __field(enum sk_rst_reason, reason) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), @@ -113,14 +126,19 @@ TRACE_EVENT(tcp_send_reset, */ TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, entry->saddr); } + __entry->reason = reason; ), - TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s", + TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s reason=%s", __entry->skbaddr, __entry->skaddr, __entry->saddr, __entry->daddr, - __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN") + __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN", + __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe))) ); +#undef FN +#undef FNe + /* * tcp event with arguments sk * diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 621879ba916d..0427deca3e0e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - trace_tcp_send_reset(sk, skb); + trace_tcp_send_reset(sk, skb, reason); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 41c352bf3394..ece4726c8751 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3641,7 +3641,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ - trace_tcp_send_reset(sk, NULL); + trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); } /* Send a crossed SYN-ACK during socket establishment. 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 317d7a6e6b01..77958adf2e16 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, label = ip6_flowlabel(ipv6h); } - trace_tcp_send_reset(sk, skb); + trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, -- cgit v1.2.3-59-g8ed1b From 1bede0a12d3a45bd366d3cf9e1c7611d86f1bc1f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Apr 2024 19:34:50 +0000 Subject: tcp: fix tcp_grow_skb() vs tstamps I forgot to call tcp_skb_collapse_tstamp() in the case we consume the second skb in write queue. Neal suggested to create a common helper used by tcp_mtu_probe() and tcp_grow_skb(). Fixes: 8ee602c63520 ("tcp: try to send bigger TSO packets") Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Link: https://lore.kernel.org/r/20240425193450.411640-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ece4726c8751..ea7ad7d99245 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2403,6 +2403,21 @@ commit: return 0; } +/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if + * all its payload was moved to another one (dst). + * Make sure to transfer tcp_flags, eor, and tstamp. + */ +static void tcp_eat_one_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) +{ + TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; + TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; + tcp_skb_collapse_tstamp(dst, src); + tcp_unlink_write_queue(src, sk); + tcp_wmem_free_skb(sk, src); +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -2508,16 +2523,7 @@ static int tcp_mtu_probe(struct sock *sk) copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { - /* We've eaten all the data from this skb. - * Throw it away. */ - TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; - /* If this is the last SKB we copy and eor is set - * we need to propagate it to the new skb. 
- */ - TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; - tcp_skb_collapse_tstamp(nskb, skb); - tcp_unlink_write_queue(skb, sk); - tcp_wmem_free_skb(sk, skb); + tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); @@ -2705,11 +2711,10 @@ static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) TCP_SKB_CB(next_skb)->seq += nlen; if (!next_skb->len) { + /* In case FIN is set, we need to update end_seq */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; - TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; - tcp_unlink_write_queue(next_skb, sk); - tcp_wmem_free_skb(sk, next_skb); + + tcp_eat_one_skb(sk, skb, next_skb); } } -- cgit v1.2.3-59-g8ed1b From f3d93817fba30a8d3508fa990405039c0820dca3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Apr 2024 13:40:24 +0000 Subject: net: add Move some proto memory definitions out of Very few files need them, and following patch will include from Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240429134025.1233626-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/proto_memory.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 78 ------------------------------------------- net/core/sock.c | 1 + net/core/sysctl_net_core.c | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_output.c | 1 + net/sctp/sm_statefuns.c | 1 + 9 files changed, 90 insertions(+), 78 deletions(-) create mode 100644 include/net/proto_memory.h (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h new file mode 100644 index 000000000000..41404d4bb6f0 --- /dev/null +++ b/include/net/proto_memory.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PROTO_MEMORY_H +#define _PROTO_MEMORY_H + +#include + +/* 1 MB per cpu, in page units */ +#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) +extern int sysctl_mem_pcpu_rsv; + +static inline bool sk_has_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure != NULL; +} + +static inline bool +proto_memory_pressure(const struct proto *prot) +{ + if (!prot->memory_pressure) + return false; + return !!READ_ONCE(*prot->memory_pressure); +} + +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return proto_memory_pressure(sk->sk_prot); +} + +static inline bool sk_under_memory_pressure(const struct sock *sk) +{ + if (!sk->sk_prot->memory_pressure) + return false; + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) + return true; + + return !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + +static inline long +proto_memory_allocated(const struct proto *prot) +{ + return max(0L, atomic_long_read(prot->memory_allocated)); +} + +static inline long +sk_memory_allocated(const struct sock *sk) +{ + return proto_memory_allocated(sk->sk_prot); +} + +static inline void proto_memory_pcpu_drain(struct proto *proto) +{ + int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); + + if (val) + atomic_long_add(val, proto->memory_allocated); +} + +static inline void +sk_memory_allocated_add(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv))) + 
proto_memory_pcpu_drain(proto); +} + +static inline void +sk_memory_allocated_sub(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +#endif /* _PROTO_MEMORY_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 48bcc845202f..0450494a1766 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1371,75 +1371,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk, #endif } -static inline bool sk_has_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure != NULL; -} - -static inline bool sk_under_global_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure && - !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline bool sk_under_memory_pressure(const struct sock *sk) -{ - if (!sk->sk_prot->memory_pressure) - return false; - - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) - return true; - - return !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline long -proto_memory_allocated(const struct proto *prot) -{ - return max(0L, atomic_long_read(prot->memory_allocated)); -} - -static inline long -sk_memory_allocated(const struct sock *sk) -{ - return proto_memory_allocated(sk->sk_prot); -} - -/* 1 MB per cpu, in page units */ -#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) -extern int sysctl_mem_pcpu_rsv; - -static inline void proto_memory_pcpu_drain(struct proto *proto) -{ - int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); - - if (val) - atomic_long_add(val, proto->memory_allocated); -} - -static inline void -sk_memory_allocated_add(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - -static inline void -sk_memory_allocated_sub(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) @@ -1466,15 +1397,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline bool -proto_memory_pressure(struct proto *prot) -{ - if (!prot->memory_pressure) - return false; - return !!READ_ONCE(*prot->memory_pressure); -} - - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { diff --git a/net/core/sock.c b/net/core/sock.c index fe9195186c13..e0692b752369 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -127,6 +127,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 118c78615543..a452a330d0ed 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "dev.h" diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 914bc9c35cc7..6c4664c681ca 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 
0a3aa3047083..e1f0efbb29d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,6 +272,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53e1150f706f..ad8fa129fcfe 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ea7ad7d99245..57edf66ff91b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,7 @@ #include #include +#include #include #include diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 08fdf1251f46..5adf0c0a6c1a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 94062790aedb505bdda209b10bea47b294d6394f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 May 2024 12:54:48 +0000 Subject: tcp: defer shutdown(SEND_SHUTDOWN) for TCP_SYN_RECV sockets TCP_SYN_RECV state is really special, it is only used by cross-syn connections, mostly used by fuzzers. In the following crash [1], syzbot managed to trigger a divide by zero in tcp_rcv_space_adjust() A socket makes the following state transitions, without ever calling tcp_init_transfer(), meaning tcp_init_buffer_space() is also not called. TCP_CLOSE connect() TCP_SYN_SENT TCP_SYN_RECV shutdown() -> tcp_shutdown(sk, SEND_SHUTDOWN) TCP_FIN_WAIT1 To fix this issue, change tcp_shutdown() to not perform a TCP_SYN_RECV -> TCP_FIN_WAIT1 transition, which makes no sense anyway. When tcp_rcv_state_process() later changes socket state from TCP_SYN_RECV to TCP_ESTABLISH, then look at sk->sk_shutdown to finally enter TCP_FIN_WAIT1 state, and send a FIN packet from a sane socket state. This means tcp_send_fin() can now be called from BH context, and must use GFP_ATOMIC allocations. 
[1] divide error: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 1 PID: 5084 Comm: syz-executor358 Not tainted 6.9.0-rc6-syzkaller-00022-g98369dccd2f8 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 RIP: 0010:tcp_rcv_space_adjust+0x2df/0x890 net/ipv4/tcp_input.c:767 Code: e3 04 4c 01 eb 48 8b 44 24 38 0f b6 04 10 84 c0 49 89 d5 0f 85 a5 03 00 00 41 8b 8e c8 09 00 00 89 e8 29 c8 48 0f af c3 31 d2 <48> f7 f1 48 8d 1c 43 49 8d 96 76 08 00 00 48 89 d0 48 c1 e8 03 48 RSP: 0018:ffffc900031ef3f0 EFLAGS: 00010246 RAX: 0c677a10441f8f42 RBX: 000000004fb95e7e RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000027d4b11f R08: ffffffff89e535a4 R09: 1ffffffff25e6ab7 R10: dffffc0000000000 R11: ffffffff8135e920 R12: ffff88802a9f8d30 R13: dffffc0000000000 R14: ffff88802a9f8d00 R15: 1ffff1100553f2da FS: 00005555775c0380(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1155bf2304 CR3: 000000002b9f2000 CR4: 0000000000350ef0 Call Trace: tcp_recvmsg_locked+0x106d/0x25a0 net/ipv4/tcp.c:2513 tcp_recvmsg+0x25d/0x920 net/ipv4/tcp.c:2578 inet6_recvmsg+0x16a/0x730 net/ipv6/af_inet6.c:680 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x109/0x280 net/socket.c:1068 ____sys_recvmsg+0x1db/0x470 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x474/0xae0 net/socket.c:2939 __sys_recvmmsg net/socket.c:3018 [inline] __do_sys_recvmmsg net/socket.c:3041 [inline] __se_sys_recvmmsg net/socket.c:3034 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3034 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7faeb6363db9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 c1 17 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffcc1997168 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007faeb6363db9 RDX: 0000000000000001 RSI: 0000000020000bc0 RDI: 0000000000000005 RBP: 0000000000000000 R08: 0000000000000000 R09: 000000000000001c R10: 0000000000000122 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000001 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Link: https://lore.kernel.org/r/20240501125448.896529-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 2 ++ net/ipv4/tcp_output.c | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e767721b3a58..66d77faca64f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2710,7 +2710,7 @@ void tcp_shutdown(struct sock *sk, int how) /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); @@ -2819,7 +2819,7 @@ void __tcp_close(struct sock *sk, long timeout) * machine. 
State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 - * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d874817a78d..a140d9f7a0a3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6761,6 +6761,8 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); + if (sk->sk_shutdown & SEND_SHUTDOWN) + tcp_shutdown(sk, SEND_SHUTDOWN); break; case TCP_FIN_WAIT1: { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3167ad96567..02caeb7bcf63 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3563,7 +3563,9 @@ void tcp_send_fin(struct sock *sk) return; } } else { - skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | + __GFP_NOWARN)); if (unlikely(!skb)) return; -- cgit v1.2.3-59-g8ed1b From 378979e94e953c2070acb4f0e0c98d29260bd09d Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 21 May 2024 21:42:20 +0800 Subject: tcp: remove 64 KByte limit for initial tp->rcv_wnd value Recently, we had some servers upgraded to the latest kernel and noticed the indicator from the user side showed worse results than before. It is caused by the limitation of tp->rcv_wnd. In 2018 commit a337531b942b ("tcp: up initial rmem to 128KB and SYN rwin to around 64KB") limited the initial value of tp->rcv_wnd to 65535, most CDN teams would not benefit from this change because they cannot have a large window to receive a big packet, which will be slowed down especially in long RTT. Small rcv_wnd means slow transfer speed, to some extent. It's the side effect for the latency/time-sensitive users. To avoid future confusion, current change doesn't affect the initial receive window on the wire in a SYN or SYN+ACK packet which are set within 65535 bytes according to RFC 7323 also due to the limit in __tcp_transmit_skb(): th->window = htons(min(tp->rcv_wnd, 65535U)); In one word, __tcp_transmit_skb() already ensures that constraint is respected, no matter how large tp->rcv_wnd is. The change doesn't violate RFC. Let me provide one example if with or without the patch: Before: client --- SYN: rwindow=65535 ---> server client <--- SYN+ACK: rwindow=65535 ---- server client --- ACK: rwindow=65536 ---> server Note: for the last ACK, the calculation is 512 << 7. After: client --- SYN: rwindow=65535 ---> server client <--- SYN+ACK: rwindow=65535 ---- server client --- ACK: rwindow=175232 ---> server Note: I use the following command to make it work: ip route change default via [ip] dev eth0 metric 100 initrwnd 120 For the last ACK, the calculation is 1369 << 7. When we apply such a patch, having a large rcv_wnd if the user tweak this knob can help transfer data more rapidly and save some rtts. 
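For reference, a minimal sketch of the arithmetic behind the two ACK examples above (plain userspace C, not kernel code; it assumes the receive window scale of 7 that the example negotiates):

	#include <stdio.h>

	int main(void)
	{
		unsigned int rcv_wscale = 7;	/* as negotiated in the example above */

		/* Before the patch: rcv_wnd capped near 64 KB, the ACK carries 512. */
		printf("before: %u\n", 512u << rcv_wscale);	/* 65536 */

		/* After the patch, with "initrwnd 120": the ACK carries 1369. */
		printf("after:  %u\n", 1369u << rcv_wscale);	/* 175232 */
		return 0;
	}
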
Fixes: a337531b942b ("tcp: up initial rmem to 128KB and SYN rwin to around 64KB") Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Acked-by: Neal Cardwell Link: https://lore.kernel.org/r/20240521134220.12510-1-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95caf8aaa8be..95618d0e78e4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -232,7 +232,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else - (*rcv_wnd) = min_t(u32, space, U16_MAX); + (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); -- cgit v1.2.3-59-g8ed1b
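The tp->window_clamp patch at the top of this series follows the usual pattern for fields that are read locklessly. A minimal sketch of that pattern (field and helper names here are illustrative only, not actual kernel code):

	#include <linux/compiler.h>
	#include <linux/types.h>

	/* Read the shared field once into a local, compute on the local, then
	 * publish the result with a single annotated store, so concurrent
	 * lockless readers never observe a torn or intermediate value.
	 */
	static void clamp_shared_field(u32 *field, u32 limit)
	{
		u32 val = READ_ONCE(*field);

		if (val == 0 || val > limit)
			val = limit;

		WRITE_ONCE(*field, val);
	}

Reading into a local also keeps later uses of the value consistent within the function, which is why tcp_select_initial_window() above works on the local window_clamp rather than dereferencing the pointer repeatedly.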