diff options
author | 2025-06-17 16:19:04 -0700 | |
---|---|---|
committer | 2025-06-17 16:19:05 -0700 | |
commit | 4b7da7341bea62d24fe2f06320f39368dec9603d (patch) | |
tree | 14cc0ade0798fa4be0354a733325bb8e911b010d | |
parent | net: bcmgenet: update PHY power down (diff) | |
parent | tcp: remove RFC3517/RFC6675 tcp_clear_retrans_hints_partial() (diff) | |
download | wireguard-linux-4b7da7341bea62d24fe2f06320f39368dec9603d.tar.xz wireguard-linux-4b7da7341bea62d24fe2f06320f39368dec9603d.zip |
Merge branch 'tcp-remove-obsolete-rfc3517-rfc6675-code'
Neal Cardwell says:
====================
tcp: remove obsolete RFC3517/RFC6675 code
RACK-TLP loss detection has been enabled as the default loss detection
algorithm for Linux TCP since 2018, in:
commit b38a51fec1c1 ("tcp: disable RFC6675 loss detection")
In case users ran into unexpected bugs or performance regressions,
that commit allowed Linux system administrators to revert to using
RFC3517/RFC6675 loss recovery by setting net.ipv4.tcp_recovery to 0.
In the seven years since 2018, our team has not heard reports of
anyone reverting Linux TCP to use RFC3517/RFC6675 loss recovery, and
we can't find any record in web searches of such a revert.
RACK-TLP was published as a standards-track RFC, RFC8985, in February
2021.
Several other major TCP implementations have default-enabled RACK-TLP
at this point as well.
RACK-TLP offers several significant performance advantages over
RFC3517/RFC6675 loss recovery, including much better performance in
the common cases of tail drops, lost retransmissions, and reordering.
It is now time to remove the obsolete and unused RFC3517/RFC6675 loss
recovery code. This will allow a substantial simplification of the
Linux TCP code base, and removes 12 bytes of state in every tcp_sock
for 64-bit machines (8 bytes on 32-bit machines).
To arrange the commits in reasonable sizes, this patch series is split
into 3 commits:
(1) Removes the core RFC3517/RFC6675 logic.
(2) Removes the RFC3517/RFC6675 hint state and the first layer of logic that
updates that state.
(3) Removes the emptied-out tcp_clear_retrans_hints_partial() helper function
and all of its call sites.
====================
Link: https://patch.msgid.link/20250615001435.2390793-1-ncardwell.sw@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | Documentation/networking/ip-sysctl.rst | 8 | ||||
-rw-r--r-- | Documentation/networking/net_cachelines/tcp_sock.rst | 2 | ||||
-rw-r--r-- | include/linux/tcp.h | 3 | ||||
-rw-r--r-- | include/net/tcp.h | 6 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 158 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 6 |
7 files changed, 16 insertions, 170 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 0f1251cce314..466bc3f5186e 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -645,9 +645,11 @@ tcp_recovery - INTEGER features. ========= ============================================================= - RACK: 0x1 enables the RACK loss detection for fast detection of lost - retransmissions and tail drops. It also subsumes and disables - RFC6675 recovery for SACK connections. + RACK: 0x1 enables RACK loss detection, for fast detection of lost + retransmissions and tail drops, and resilience to + reordering. currently, setting this bit to 0 has no + effect, since RACK is the only supported loss detection + algorithm. RACK: 0x2 makes RACK's reordering window static (min_rtt/4). diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index bc9b2131bf7a..7bbda5944ee2 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -115,7 +115,6 @@ u32 lost_out read_mostly read_m u32 sacked_out read_mostly read_mostly tcp_left_out(tx);tcp_packets_in_flight(tx/rx);tcp_clean_rtx_queue(rx) struct hrtimer pacing_timer struct hrtimer compressed_ack_timer -struct sk_buff* lost_skb_hint read_mostly tcp_clean_rtx_queue struct sk_buff* retransmit_skb_hint read_mostly tcp_clean_rtx_queue struct rb_root out_of_order_queue read_mostly tcp_data_queue,tcp_fast_path_check struct sk_buff* ooo_last_skb @@ -123,7 +122,6 @@ struct tcp_sack_block[1] duplicate_sack struct tcp_sack_block[4] selective_acks struct tcp_sack_block[4] recv_sack_cache struct sk_buff* highest_sack read_write tcp_event_new_data_sent -int lost_cnt_hint u32 prior_ssthresh u32 high_seq u32 retrans_stamp diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 29f59d50dc73..1a5737b3753d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -208,7 +208,6 @@ struct tcp_sock { u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ - struct sk_buff *lost_skb_hint; struct sk_buff *retransmit_skb_hint; __cacheline_group_end(tcp_sock_read_tx); @@ -419,8 +418,6 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; - int lost_cnt_hint; - u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 5078ad868fee..9f852f5f8b95 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1811,14 +1811,8 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) -{ - tp->lost_skb_hint = NULL; -} - static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { - tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f64f8276a73c..27d3ef83ce7b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5053,9 +5053,8 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ec92dec321a..05b9571c9c92 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1451,11 +1451,6 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; - - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (tp->lost_skb_hint && - before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) - tp->lost_cnt_hint += pcount; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1496,9 +1491,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); - if (skb == tp->lost_skb_hint) - tp->lost_cnt_hint += pcount; - TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1531,10 +1523,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->lost_skb_hint) { - tp->lost_skb_hint = prev; - tp->lost_cnt_hint -= tcp_skb_pcount(prev); - } TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; @@ -2151,12 +2139,6 @@ static inline void tcp_init_undo(struct tcp_sock *tp) tp->undo_retrans = -1; } -static bool tcp_is_rack(const struct sock *sk) -{ - return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & - TCP_RACK_LOSS_DETECTION; -} - /* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. @@ -2182,8 +2164,7 @@ static void tcp_timeout_mark_lost(struct sock *sk) skb_rbtree_walk_from(skb) { if (is_reneg) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; - else if (tcp_is_rack(sk) && skb != head && - tcp_rack_skb_timeout(tp, skb, 0) > 0) + else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0) continue; /* Don't mark recently sent ones lost yet */ tcp_mark_skb_lost(sk, skb); } @@ -2264,22 +2245,6 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) return false; } -/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs - * counter when SACK is enabled (without SACK, sacked_out is used for - * that purpose). - * - * With reordering, holes may still be in flight, so RFC3517 recovery - * uses pure sacked_out (total number of SACKed segments) even though - * it violates the RFC that uses duplicate ACKs, often these are equal - * but when e.g. out-of-window ACKs or packet duplication occurs, - * they differ. Since neither occurs due to loss, TCP should really - * ignore them. - */ -static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) -{ - return tp->sacked_out + 1; -} - /* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * @@ -2332,13 +2297,7 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * * If the receiver supports SACK: * - * RFC6675/3517: It is the conventional algorithm. A packet is - * considered lost if the number of higher sequence packets - * SACKed is greater than or equal the DUPACK thoreshold - * (reordering). This is implemented in tcp_mark_head_lost and - * tcp_update_scoreboard. - * - * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * RACK (RFC8985): RACK is a newer loss detection algorithm * (2017-) that checks timing instead of counting DUPACKs. * Essentially a packet is considered lost if it's not S/ACKed * after RTT + reordering_window, where both metrics are @@ -2353,8 +2312,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Really tricky (and requiring careful tuning) part of algorithm - * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). + * The really tricky (and requiring careful tuning) part of the algorithm + * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, * hence, slow down forward transmission. In fact, it determines the moment * when we decide that hole is caused by loss, rather than by a reorder. @@ -2381,79 +2340,8 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - /* Trick#1: The loss is proven. */ - if (tp->lost_out) - return true; - - /* Not-A-Trick#2 : Classic rule... */ - if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering) - return true; - - return false; -} - -/* Detect loss in event "A" above by marking head of queue up as lost. - * For RFC3517 SACK, a segment is considered lost if it - * has at least tp->reordering SACKed seqments above it; "packets" refers to - * the maximum SACKed segments to pass before reaching this limit. - */ -static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt; - /* Use SACK to deduce losses of new sequences sent during recovery */ - const u32 loss_high = tp->snd_nxt; - - WARN_ON(packets > tp->packets_out); - skb = tp->lost_skb_hint; - if (skb) { - /* Head already handled? */ - if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) - return; - cnt = tp->lost_cnt_hint; - } else { - skb = tcp_rtx_queue_head(sk); - cnt = 0; - } - - skb_rbtree_walk_from(skb) { - /* TODO: do this better */ - /* this is not the most efficient way to do this... */ - tp->lost_skb_hint = skb; - tp->lost_cnt_hint = cnt; - - if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) - break; - - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) - cnt += tcp_skb_pcount(skb); - - if (cnt > packets) - break; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) - tcp_mark_skb_lost(sk, skb); - - if (mark_head) - break; - } - tcp_verify_left_out(tp); -} - -/* Account newly detected lost packet(s) */ - -static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (tcp_is_sack(tp)) { - int sacked_upto = tp->sacked_out - tp->reordering; - if (sacked_upto >= 0) - tcp_mark_head_lost(sk, sacked_upto, 0); - else if (fast_rexmit) - tcp_mark_head_lost(sk, 1, 1); - } + /* Has loss detection marked at least one packet lost? */ + return tp->lost_out != 0; } static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) @@ -2881,8 +2769,6 @@ void tcp_simple_retransmit(struct sock *sk) tcp_mark_skb_lost(sk, skb); } - tcp_clear_retrans_hints_partial(tp); - if (!tp->lost_out) return; @@ -2990,17 +2876,8 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, *rexmit = REXMIT_LOST; } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, - bool *do_lost) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); @@ -3025,9 +2902,6 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - } else { - /* Partial ACK arrived. Force fast retransmit. */ - *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -3041,7 +2915,7 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) if (unlikely(tcp_is_reno(tp))) { tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); - } else if (tcp_is_rack(sk)) { + } else { u32 prior_retrans = tp->retrans_out; if (tcp_rack_mark_lost(sk)) @@ -3068,10 +2942,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int fast_rexmit = 0, flag = *ack_flag; + int flag = *ack_flag; bool ece_ack = flag & FLAG_ECE; - bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && - tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; @@ -3120,7 +2992,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); - } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) + } else if (tcp_try_undo_partial(sk, prior_snd_una)) return; if (tcp_try_undo_dsack(sk)) @@ -3175,11 +3047,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Otherwise enter Recovery state */ tcp_enter_recovery(sk, ece_ack); - fast_rexmit = 1; } - if (!tcp_is_rack(sk) && do_lost) - tcp_update_scoreboard(sk, fast_rexmit); *rexmit = REXMIT_LOST; } @@ -3435,8 +3304,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; - if (unlikely(skb == tp->lost_skb_hint)) - tp->lost_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -3494,14 +3361,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (flag & FLAG_RETRANS_DATA_ACKED) flag &= ~FLAG_ORIG_SACK_ACKED; } else { - int delta; - /* Non-retransmitted hole got filled? That's reordering */ if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); - - delta = prior_sacked - tp->sacked_out; - tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ac8d2d17e1f..eb50746dc482 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1554,11 +1554,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - if (tp->lost_skb_hint && - before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - tp->lost_cnt_hint -= decr; - tcp_verify_left_out(tp); } @@ -3252,7 +3247,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; |