diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1185 |
1 files changed, 749 insertions, 436 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b6b57000dad..0640453fce54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ +#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -138,6 +139,69 @@ void clean_acked_data_flush(void) EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif +#ifdef CONFIG_CGROUP_BPF +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + struct bpf_sock_ops_kern sock_ops; + + if (likely(!unknown_opt && !parse_all_opt)) + return; + + /* The skb will be handled in the + * bpf_skops_established() or + * bpf_skops_write_hdr_opt(). + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_LISTEN: + return; + } + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ + struct bpf_sock_ops_kern sock_ops; + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = bpf_op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} +#else +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ +} +#endif + static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, unsigned int len) { @@ -261,7 +325,8 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } @@ -349,7 +414,7 @@ static void tcp_sndbuf_expand(struct sock *sk) per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); - nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); + nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp)); nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* Fast Recovery (RFC 5681 3.2) : @@ -361,7 +426,7 @@ static void tcp_sndbuf_expand(struct sock *sk) if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, - min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -390,12 +455,13 @@ static void tcp_sndbuf_expand(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, + unsigned int skbtruesize) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -407,46 +473,74 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing + * can play nice with us, as sk_buff and skb->head might be either + * freed or shared with up to MAX_SKB_FRAGS segments. + * Only give a boost to drivers using page frag(s) to hold the frame(s), + * and if no payload was pulled in skb->head before reaching us. + */ +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) +{ + u32 truesize = skb->truesize; + + if (adjust && !skb_headlen(skb)) { + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); + /* paranoid check, some drivers might be buggy */ + if (unlikely((int)truesize < (int)skb->len)) + truesize = skb->truesize; + } + return truesize; +} + +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, + bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; + if (room <= 0) + return; + /* Check #1 */ - if (room > 0 && !tcp_under_memory_pressure(sk)) { + if (!tcp_under_memory_pressure(sk)) { + unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } + } else { + /* Under pressure: + * Adjust rcv_ssthresh according to reserved mem + */ + tcp_adjust_rcv_ssthresh(sk); } } /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ -void tcp_init_buffer_space(struct sock *sk) +static void tcp_init_buffer_space(struct sock *sk) { - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); struct tcp_sock *tp = tcp_sk(sk); int maxwin; if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -470,6 +564,8 @@ void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; + tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, + (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. */ @@ -478,16 +574,17 @@ static void tcp_clamp_window(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); + int rmem2; icsk->icsk_ack.quick = 0; + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, - min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2])); + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -517,7 +614,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. - * <http://public.lanl.gov/radiant/pubs.html#DRS> + * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, @@ -628,7 +725,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * <prev RTT . ><current RTT .. ><next RTT .... > */ - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; u64 rcvwin, grow; @@ -649,7 +746,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(rcvwin, tp->advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -709,7 +806,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); - sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -717,7 +813,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(sk, skb); if (skb->len >= 128) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -813,12 +909,12 @@ static void tcp_update_pacing_rate(struct sock *sk) * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching * end of slow start and should slow down. */ - if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; + if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) + rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio); else - rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; + rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio); - rate *= max(tp->snd_cwnd, tp->packets_out); + rate *= max(tcp_snd_cwnd(tp), tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); @@ -870,12 +966,63 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* Take a notice that peer is sending D-SACKs */ -static void tcp_dsack_seen(struct tcp_sock *tp) +struct tcp_sacktag_state { + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + u64 first_sackt; + u64 last_sackt; + u32 reord; + u32 sack_delivered; + int flag; + unsigned int mss_now; + struct rate_sample *rate; +}; + +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery + * and spurious retransmission information if this DSACK is unlikely caused by + * sender's action: + * - DSACKed sequence range is larger than maximum receiver's window. + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. + */ +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, + u32 end_seq, struct tcp_sacktag_state *state) { + u32 seq_len, dup_segs = 1; + + if (!before(start_seq, end_seq)) + return 0; + + seq_len = end_seq - start_seq; + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ + if (seq_len > tp->max_window) + return 0; + if (seq_len > tp->mss_cache) + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) + state->flag |= FLAG_DSACK_TLP; + + tp->dsack_dups += dup_segs; + /* Skip the DSACK if dup segs weren't retransmitted by sender */ + if (tp->dsack_dups > tp->total_retrans) + return 0; + tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; - tp->rack.dsack_seen = 1; - tp->dsack_dups++; + /* We increase the RACK ordering window in rounds where we receive + * DSACKs that may have been due to reordering causing RACK to trigger + * a spurious fast recovery. Thus RACK ignores DSACKs that happen + * without having seen reordering, or that match TLP probes (TLP + * is timer-driven, not triggered by RACK). + */ + if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) + tp->rack.dsack_seen = 1; + + state->flag |= FLAG_DSACKING_ACK; + /* A spurious retransmission is delivered */ + state->sack_delivered += dup_segs; + + return dup_segs; } /* It's reordering when higher sequence was delivered (i.e. sacked) before @@ -904,7 +1051,7 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, tp->undo_marker ? tp->undo_retrans : 0); #endif tp->reordering = min_t(u32, (metric + mss - 1) / mss, - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); } /* This exciting event is worth to be remembered. 8) */ @@ -913,7 +1060,11 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } -/* This must be called before lost_out is incremented */ + /* This must be called before lost_out or retrans_out are updated + * on a new loss, because we want to know if all skbs previously + * known to be lost have already been retransmitted, indicating + * that this newly lost skb is our next skb to retransmit. + */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || @@ -923,42 +1074,46 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_skb_hint = skb; } -/* Sum the number of packets on the wire we have marked as lost. - * There are two cases we care about here: - * a) Packet hasn't been marked lost (nor retransmitted), - * and this is the first loss. - * b) Packet has been marked both lost and retransmitted, - * and this means we think it was lost again. +/* Sum the number of packets on the wire we have marked as lost, and + * notify the congestion control module that the given skb was marked lost. */ -static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; - - if (!(sacked & TCPCB_LOST) || - ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) - tp->lost += tcp_skb_pcount(skb); + tp->lost += tcp_skb_pcount(skb); } -static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tcp_verify_retransmit_hint(tp, skb); + __u8 sacked = TCP_SKB_CB(skb)->sacked; + struct tcp_sock *tp = tcp_sk(sk); + + if (sacked & TCPCB_SACKED_ACKED) + return; + tcp_verify_retransmit_hint(tp, skb); + if (sacked & TCPCB_LOST) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* Account for retransmits that are lost again */ + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, + tcp_skb_pcount(skb)); + tcp_notify_skb_loss_event(tp, skb); + } + } else { tp->lost_out += tcp_skb_pcount(skb); - tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tcp_notify_skb_loss_event(tp, skb); } } -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) { - tcp_verify_retransmit_hint(tp, skb); - - tcp_sum_lost(tp, skb); - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - } + tp->delivered += delivered; + if (ece_ack) + tp->delivered_ce += delivered; } /* This procedure tags the retransmission queue when SACKs arrive. @@ -1093,52 +1248,43 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, - u32 prior_snd_una) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); - bool dup_sack = false; + u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = true; - tcp_dsack_seen(tp); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); - if (!after(end_seq_0, end_seq_1) && - !before(start_seq_0, start_seq_1)) { - dup_sack = true; - tcp_dsack_seen(tp); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPDSACKOFORECV); - } + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) + return false; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); + } else { + return false; + } + + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + if (!dup_segs) { /* Skip dubious DSACK */ + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); + return false; } + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); + /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && + if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); - return dup_sack; + return true; } -struct tcp_sacktag_state { - u32 reord; - /* Timestamps for earliest and latest never-retransmitted segment - * that was SACKed. RTO needs the earliest RTT to stay conservative, - * but congestion control should still get an accurate delay signal. - */ - u64 first_sackt; - u64 last_sackt; - struct rate_sample *rate; - int flag; - unsigned int mss_now; -}; - /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment @@ -1208,7 +1354,7 @@ static u8 tcp_sacktag_one(struct sock *sk, if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq; @@ -1257,7 +1403,8 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; - tp->delivered += pcount; /* Out-of-order packets delivered */ + /* Out-of-order packets delivered */ + state->sack_delivered += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && @@ -1513,6 +1660,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, (mss != tcp_skb_seglen(skb))) goto out; + if (!tcp_skb_can_collapse(prev, skb)) + goto out; len = skb->len; pcount = tcp_skb_pcount(skb); if (tcp_skb_shift(prev, skb, pcount, len)) @@ -1680,11 +1829,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, - num_sacks, prior_snd_una); - if (found_dup_sack) { - state->flag |= FLAG_DSACKING_ACK; - tp->delivered++; /* A spurious retransmission is delivered */ - } + num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1885,14 +2030,14 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) return; tp->reordering = min_t(u32, tp->packets_out + addend, - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk, int num_dupack) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); @@ -1903,20 +2048,21 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack) tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) - tp->delivered += delivered; + tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - tp->delivered += max_t(int, acked - tp->sacked_out, 1); + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), + ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -1949,7 +2095,8 @@ static inline void tcp_init_undo(struct tcp_sock *tp) static bool tcp_is_rack(const struct sock *sk) { - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_LOSS_DETECTION; } /* If we detect SACK reneging, forget all SACK information @@ -1993,6 +2140,7 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; + u8 reordering; tcp_timeout_mark_lost(sk); @@ -2001,22 +2149,24 @@ void tcp_enter_loss(struct sock *sk) !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->prior_cwnd = tp->snd_cwnd; + tp->prior_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } - tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; + tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) + tp->sacked_out >= reordering) tp->reordering = min_t(unsigned int, tp->reordering, - net->ipv4.sysctl_tcp_reordering); + reordering); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -2025,7 +2175,7 @@ void tcp_enter_loss(struct sock *sk) * loss recovery is underway except recurring timeout(s) on * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing */ - tp->frto = net->ipv4.sysctl_tcp_frto && + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -2042,7 +2192,8 @@ void tcp_enter_loss(struct sock *sk) */ static bool tcp_check_sack_reneging(struct sock *sk, int flag) { - if (flag & FLAG_SACK_RENEGING) { + if (flag & FLAG_SACK_RENEGING && + flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); @@ -2183,8 +2334,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For non-SACK(Reno) senders, the first "packets" number of segments - * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. */ @@ -2192,10 +2342,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt, lost; - unsigned int mss; + int cnt; /* Use SACK to deduce losses of new sequences sent during recovery */ - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; + const u32 loss_high = tp->snd_nxt; WARN_ON(packets > tp->packets_out); skb = tp->lost_skb_hint; @@ -2218,28 +2367,14 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; - oldcnt = cnt; - if (tcp_is_reno(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) cnt += tcp_skb_pcount(skb); - if (cnt > packets) { - if (tcp_is_sack(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || - (oldcnt >= packets)) - break; - - mss = tcp_skb_mss(skb); - /* If needed, chop off the prefix to mark as lost. */ - lost = (packets - oldcnt) * mss; - if (lost < skb->len && - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, - lost, mss, GFP_ATOMIC) < 0) - break; - cnt = packets; - } + if (cnt > packets) + break; - tcp_skb_mark_lost(tp, skb); + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) + tcp_mark_skb_lost(sk, skb); if (mark_head) break; @@ -2328,7 +2463,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, &inet->inet_daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), + tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -2337,7 +2472,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, &sk->sk_v6_daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), + tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -2362,7 +2497,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); - tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); + tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk)); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -2379,6 +2514,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp) return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + return true; + } + return false; +} + /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { @@ -2401,14 +2551,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { - /* Hold old state until something *above* high_seq - * is ACKed. For Reno it is MUST to prevent false - * fast retransmits (RFC2582). SACK TCP is safe. */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) return true; - } tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; @@ -2444,6 +2588,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) + return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; @@ -2459,7 +2605,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. - * But when the retransmits are acked without further losses, PRR + * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) @@ -2469,14 +2615,14 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tp->high_seq = tp->snd_nxt; tp->tlp_high_seq = 0; tp->snd_cwnd_cnt = 0; - tp->prior_cwnd = tp->snd_cwnd; + tp->prior_cwnd = tcp_snd_cwnd(tp); tp->prr_delivered = 0; tp->prr_out = 0; tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); tcp_ecn_queue_cwr(tp); } -void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2490,17 +2636,16 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == - FLAG_RETRANS_DATA_ACKED) { - sndcnt = min_t(int, delta, - max_t(int, tp->prr_delivered - tp->prr_out, - newly_acked_sacked) + 1); } else { - sndcnt = min(delta, newly_acked_sacked); + sndcnt = max_t(int, tp->prr_delivered - tp->prr_out, + newly_acked_sacked); + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) + sndcnt++; + sndcnt = min(delta, sndcnt); } /* Force a fast retransmit upon entering fast recovery */ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); - tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; + tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt); } static inline void tcp_end_cwnd_reduction(struct sock *sk) @@ -2513,7 +2658,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { - tp->snd_cwnd = tp->snd_ssthresh; + tcp_snd_cwnd_set(tp, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); @@ -2577,12 +2722,15 @@ static void tcp_mtup_probe_success(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + u64 val; - /* FIXME: breaks with very large cwnd */ tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = tp->snd_cwnd * - tcp_mss_to_mtu(sk, tp->mss_cache) / - icsk->icsk_mtup.probe_size; + + val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache); + do_div(val, icsk->icsk_mtup.probe_size); + DEBUG_NET_WARN_ON_ONCE((u32)val != val); + tcp_snd_cwnd_set(tp, max_t(u32, 1U, val)); + tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_ssthresh = tcp_current_ssthresh(sk); @@ -2602,17 +2750,26 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk); + int mss; + + /* A fastopen SYN request is stored as two separate packets within + * the retransmit queue, this is done by tcp_send_syn_data(). + * As a result simply checking the MSS of the frames in the queue + * will not work for the SYN packet. + * + * Us being here is an indication of a path MTU issue so we can + * assume that the fastopen SYN was lost and just mark all the + * frames in the retransmit queue as lost. We will use an MSS of + * -1 to mark all frames as lost, otherwise compute the current MSS. + */ + if (tp->syn_data && sk->sk_state == TCP_SYN_SENT) + mss = -1; + else + mss = tcp_current_mss(sk); skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { - if (tcp_skb_seglen(skb) > mss && - !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - } - tcp_skb_mark_lost_uncond_verify(tp, skb); - } + if (tcp_skb_seglen(skb) > mss) + tcp_mark_skb_lost(sk, skb); } tcp_clear_retrans_hints_partial(tp); @@ -2713,15 +2870,24 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, * delivered. Lower inflight to clock out (re)tranmissions. */ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } *rexmit = REXMIT_LOST; } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, + bool *do_lost) { struct tcp_sock *tp = tcp_sk(sk); @@ -2746,7 +2912,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - return true; + } else { + /* Partial ACK arrived. Force fast retransmit. */ + *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -2763,20 +2931,13 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) } else if (tcp_is_rack(sk)) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk); + if (tcp_rack_mark_lost(sk)) + *ack_flag &= ~FLAG_SET_XMIT_TIMER; if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2795,6 +2956,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; + bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); @@ -2803,7 +2965,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag & FLAG_ECE) + if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ @@ -2816,7 +2978,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - WARN_ON(tp->retrans_out != 0); + WARN_ON(tp->retrans_out != 0 && !tp->syn_data); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -2844,19 +3006,22 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) - tcp_add_reno_sack(sk, num_dupack); - } else { - if (tcp_try_undo_partial(sk, prior_snd_una)) - return; - /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_is_reno(tp) || - tcp_force_fast_retransmit(sk); - } - if (tcp_try_undo_dsack(sk)) { - tcp_try_keep_open(sk); + tcp_add_reno_sack(sk, num_dupack, ece_ack); + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) return; - } + + if (tcp_try_undo_dsack(sk)) + tcp_try_keep_open(sk); + tcp_identify_packet_loss(sk, ack_flag); + if (icsk->icsk_ca_state != TCP_CA_Recovery) { + if (!tcp_time_to_recover(sk, flag)) + return; + /* Undo reverts the recovery state. If loss is evident, + * starts a new recovery (e.g. reordering then loss); + */ + tcp_enter_recovery(sk, ece_ack); + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); @@ -2865,12 +3030,12 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ - /* fall through */ + fallthrough; default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2888,13 +3053,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tp->snd_una == tp->mtu_probe.probe_seq_start) { tcp_mtup_probe_failed(sk); /* Restores the reduction we did in tcp_mtup_probe() */ - tp->snd_cwnd++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tcp_simple_retransmit(sk); return; } /* Otherwise enter Recovery state */ - tcp_enter_recovery(sk, (flag & FLAG_ECE)); + tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } @@ -2905,7 +3070,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; struct tcp_sock *tp = tcp_sk(sk); if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { @@ -2944,6 +3109,8 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ca_rtt_us = seq_rtt_us; } @@ -3014,7 +3181,7 @@ void tcp_rearm_rto(struct sock *sk) rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX, tcp_rtx_queue_head(sk)); + TCP_RTO_MAX); } } @@ -3047,7 +3214,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, - u32 prior_snd_una) + const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct skb_shared_info *shinfo; @@ -3059,7 +3226,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) { tcp_skb_tsorted_save(skb) { - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK); } tcp_skb_tsorted_restore(skb); } } @@ -3068,9 +3235,9 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, - u32 prior_snd_una, - struct tcp_sacktag_state *sack) +static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, + u32 prior_fack, u32 prior_snd_una, + struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; @@ -3083,7 +3250,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; - u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3095,8 +3261,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u8 sacked = scb->sacked; u32 acked_pcount; - tcp_ack_tstamp(sk, skb, prior_snd_una); - /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3121,7 +3285,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!first_ackt) first_ackt = last_ackt; - last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) @@ -3131,7 +3294,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { - tp->delivered += acked_pcount; + tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); @@ -3160,6 +3323,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; @@ -3175,15 +3340,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - flag |= FLAG_SACK_RENEGING; + if (skb) { + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + flag |= FLAG_SACK_RENEGING; + } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - if (pkts_acked == 1 && last_in_flight < tp->mss_cache && - last_in_flight && !prior_sacked && fully_acked && + if (pkts_acked == 1 && fully_acked && !prior_sacked && + (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically @@ -3208,7 +3376,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, } if (tcp_is_reno(tp)) { - tcp_remove_reno_sacks(sk, pkts_acked); + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that @@ -3240,9 +3408,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = sack->rate->rtt_us, - .in_flight = last_in_flight }; + .rtt_us = sack->rate->rtt_us }; + sample.in_flight = tp->mss_cache * + (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } @@ -3283,6 +3452,7 @@ static void tcp_ack_probe(struct sock *sk) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! @@ -3290,8 +3460,8 @@ static void tcp_ack_probe(struct sock *sk) } else { unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - when, TCP_RTO_MAX, NULL); + when = tcp_clamp_probe0_to_user_timeout(sk, when); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); } } @@ -3310,7 +3480,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; @@ -3333,7 +3504,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ - tcp_cwnd_reduction(sk, acked_sacked, flag); + tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag); } else if (tcp_may_raise_cwnd(sk, flag)) { /* Advance cwnd if state allows */ tcp_cong_avoid(sk, ack, acked_sacked); @@ -3422,7 +3593,8 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } @@ -3452,14 +3624,11 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, } /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) +static void tcp_send_challenge_ack(struct sock *sk) { - /* unprotected vars, we dont care of overwrites */ - static u32 challenge_timestamp; - static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - u32 count, now; + u32 count, now, ack_limit; /* First check our per-socket dupack rate limit. */ if (__tcp_oow_rate_limited(net, @@ -3467,18 +3636,22 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; + ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); + if (ack_limit == INT_MAX) + goto send_ack; + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; - if (now != challenge_timestamp) { - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) { u32 half = (ack_limit + 1) >> 1; - challenge_timestamp = now; - WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); + WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); + WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit)); } - count = READ_ONCE(challenge_count); + count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { - WRITE_ONCE(challenge_count, count - 1); + WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); +send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } @@ -3505,10 +3678,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } } -/* This routine deals with acks during a TLP episode. - * We mark the end of a TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. +/* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -3517,7 +3688,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (before(ack, tp->tlp_high_seq)) return; - if (flag & FLAG_DSACKING_ACK) { + if (!tp->tlp_retrans) { + /* TLP of new data has been acknowledged */ + tp->tlp_high_seq = 0; + } else if (flag & FLAG_DSACK_TLP) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -3575,10 +3749,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) { - tp->delivered_ce += delivered; + if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); - } + return delivered; } @@ -3602,6 +3775,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; + sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3613,8 +3787,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk, skb); - return -1; + tcp_send_challenge_ack(sk); + return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; } @@ -3623,7 +3797,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * this segment (RFC793 Section 3.9). */ if (after(ack, tp->snd_nxt)) - return -1; + return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA; if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; @@ -3677,12 +3851,25 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ack_ev_flags |= CA_ACK_ECE; } + if (sack_state.sack_delivered) + tcp_count_delivered(tp, sack_state.sack_delivered, + flag & FLAG_ECE); + if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; tcp_in_ack_event(sk, ack_ev_flags); } + /* This is a deviation from RFC3168 since it states that: + * "When the TCP data sender is ready to set the CWR bit after reducing + * the congestion window, it SHOULD set the CWR bit only on the first + * new data packet that it transmits." + * We accept CWR on pure ACKs to be more robust + * with widely-deployed TCP implementations that do this. + */ + tcp_ecn_accept_cwr(sk, skb); + /* We passed data and got it acked, remove any soft error * log. Something worked... */ @@ -3693,18 +3880,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una, + &sack_state, flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* If needed, reset TLP/RTO timer; RACK may later override this. */ - if (flag & FLAG_SET_XMIT_TIMER) - tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { - if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | + FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { num_dupack = 1; /* Consider if pure acks were aggregated in tcp_add_backlog() */ if (!(flag & FLAG_DATA)) @@ -3714,6 +3900,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) &rexmit); } + /* If needed, reset TLP/RTO timer when RACK doesn't set. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); @@ -3775,7 +3965,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } -static void smc_parse_options(const struct tcphdr *th, +static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) @@ -3784,16 +3974,19 @@ static void smc_parse_options(const struct tcphdr *th, if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { opt_rx->smc_ok = 1; + return true; + } } #endif + return false; } /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ -static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { const unsigned char *ptr = (const unsigned char *)(th + 1); int length = (th->doff * 4) - sizeof(struct tcphdr); @@ -3832,6 +4025,7 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } +EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when @@ -3848,6 +4042,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->saw_unknown = 0; while (length > 0) { int opcode = *ptr++; @@ -3881,7 +4076,7 @@ void tcp_parse_options(const struct net *net, break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && - !estab && net->ipv4.sysctl_tcp_window_scaling) { + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > TCP_MAX_WSCALE) { @@ -3897,7 +4092,7 @@ void tcp_parse_options(const struct net *net, case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || - (!estab && net->ipv4.sysctl_tcp_timestamps))) { + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); @@ -3905,7 +4100,7 @@ void tcp_parse_options(const struct net *net, break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && - !estab && net->ipv4.sysctl_tcp_sack) { + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { opt_rx->sack_ok = TCP_SACK_SEEN; tcp_sack_reset(opt_rx); } @@ -3926,10 +4121,6 @@ void tcp_parse_options(const struct net *net, */ break; #endif - case TCPOPT_MPTCP: - mptcp_parse_option(skb, ptr, opsize, opt_rx); - break; - case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, @@ -3942,15 +4133,21 @@ void tcp_parse_options(const struct net *net, */ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && get_unaligned_be16(ptr) == - TCPOPT_FASTOPEN_MAGIC) + TCPOPT_FASTOPEN_MAGIC) { tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); - else - smc_parse_options(th, opt_rx, ptr, - opsize); + break; + } + + if (smc_parse_options(th, opt_rx, ptr, opsize)) + break; + + opt_rx->saw_unknown = 1; break; + default: + opt_rx->saw_unknown = 1; } ptr += opsize-2; length -= opsize; @@ -4111,10 +4308,16 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + /* mptcp can't tell us to ignore reset pkts, + * so just ignore the return value of mptcp_incoming_options(). + */ + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -4135,7 +4338,7 @@ void tcp_reset(struct sock *sk) tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } /* @@ -4207,7 +4410,6 @@ void tcp_fin(struct sock *sk) skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -4238,7 +4440,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4271,10 +4473,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { - sk_rethink_txhash(sk); + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && + sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); - } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4286,7 +4487,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; tcp_rcv_spurious_retrans(sk, skb); @@ -4323,10 +4524,38 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) sp[i] = sp[i + 1]; continue; } - this_sack++, swalk++; + this_sack++; + swalk++; } } +static void tcp_sack_compress_send_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->compressed_ack) + return; + + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + + /* Since we have to send one ack finally, + * substract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - 1); + + tp->compressed_ack = 0; + tcp_send_ack(sk); +} + +/* Reasonable amount of sack blocks included in TCP SACK option + * The max is 4, but this becomes 3 if TCP timestamps are there. + * Given that SACK packets might be lost, be conservative and use 2. + */ +#define TCP_SACK_BLOCKS_EXPECTED 2 + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4339,6 +4568,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) swap(*sp, *(sp - 1)); @@ -4348,6 +4579,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) } } + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); + /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. @@ -4355,8 +4589,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -4408,7 +4640,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket - * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -4466,7 +4697,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, { bool res = tcp_try_coalesce(sk, to, from, fragstolen); - /* In case tcp_drop() is called later, update to->gso_segs */ + /* In case tcp_drop_reason() is called later, update to->gso_segs */ if (res) { u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + max_t(u16, 1, skb_shinfo(from)->gso_segs); @@ -4476,10 +4707,11 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, return res; } -static void tcp_drop(struct sock *sk, struct sk_buff *skb) +static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason) { sk_drops_add(sk, skb); - __kfree_skb(skb); + kfree_skb_reason(skb, reason); } /* This one checks to see if we can put data from the @@ -4509,7 +4741,7 @@ static void tcp_ofo_queue(struct sock *sk) rb_erase(&skb->rbnode, &tp->out_of_order_queue); if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP); continue; } @@ -4564,7 +4796,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); - tcp_drop(sk, skb); + sk->sk_data_ready(sk); + tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } @@ -4597,7 +4830,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4623,7 +4860,8 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, + SKB_DROP_REASON_TCP_OFOMERGE); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4642,7 +4880,8 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb1); + tcp_drop_reason(sk, skb1, + SKB_DROP_REASON_TCP_OFOMERGE); goto merge_right; } } else if (tcp_ooo_try_coalesce(sk, skb1, @@ -4670,7 +4909,7 @@ merge_right: tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb1); + tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) @@ -4681,7 +4920,11 @@ add_sack: tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } @@ -4758,23 +5001,24 @@ err: void tcp_data_ready(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - int avail = tp->rcv_nxt - tp->copied_seq; - - if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) - return; - - sk->sk_data_ready(sk); + if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) + sk->sk_data_ready(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + enum skb_drop_reason reason; bool fragstolen; int eaten; - if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb, &tp->rx_opt); + /* If a subflow has been reset, the packet should not continue + * to be processed, drop the packet. + */ + if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) { + __kfree_skb(skb); + return; + } if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@ -4783,8 +5027,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - tcp_ecn_accept_cwr(sk, skb); - + reason = SKB_DROP_REASON_NOT_SPECIFIED; tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. @@ -4793,6 +5036,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -4802,7 +5046,9 @@ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + sk->sk_data_ready(sk); goto drop; } @@ -4837,6 +5083,7 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { tcp_rcv_spurious_retrans(sk, skb); /* A retransmit, 2nd most common case. Force an immediate ack. */ + reason = SKB_DROP_REASON_TCP_OLD_DATA; NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4844,13 +5091,16 @@ out_of_window: tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); return; } /* Out of window. F.e. zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + if (!before(TCP_SKB_CB(skb)->seq, + tp->rcv_nxt + tcp_receive_window(tp))) { + reason = SKB_DROP_REASON_TCP_OVERWINDOW; goto out_of_window; + } if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -4860,6 +5110,7 @@ drop: * remembering D-SACK for its head made in previous line. */ if (!tcp_receive_window(tp)) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -5055,7 +5306,7 @@ new_range: before(TCP_SKB_CB(skb)->end_seq, start)) { /* Do not attempt collapsing tiny skbs */ if (range_truesize != head->truesize || - end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) { tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); } else { @@ -5101,9 +5352,9 @@ static bool tcp_prune_ofo_queue(struct sock *sk) prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); goal -= rb_to_skb(node)->truesize; - tcp_drop(sk, rb_to_skb(node)); + tcp_drop_reason(sk, rb_to_skb(node), + SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); if (!prev || goal <= 0) { - sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; @@ -5139,7 +5390,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); else if (tcp_under_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + tcp_adjust_rcv_ssthresh(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -5150,7 +5401,6 @@ static int tcp_prune_queue(struct sock *sk) skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); - sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -5174,7 +5424,7 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -static bool tcp_should_expand_sndbuf(const struct sock *sk) +static bool tcp_should_expand_sndbuf(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -5185,26 +5435,30 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) { + int unused_mem = sk_unused_reserved_mem(sk); + + /* Adjust sndbuf according to reserved mem. But make sure + * it never goes below SOCK_MIN_SNDBUF. + * See sk_stream_moderate_sndbuf() for more details. + */ + if (unused_mem > SOCK_MIN_SNDBUF) + WRITE_ONCE(sk->sk_sndbuf, unused_mem); + return false; + } /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) return false; /* If we filled the congestion window, do not expand. */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)) return false; return true; } -/* When incoming ACK allowed to free some skb from write_queue, - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket - * on the exit from tcp input handler. - * - * PROBLEM: sndbuf expansion does not work well with largesend. - */ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5214,21 +5468,28 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - sk->sk_write_space(sk); + INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } -static void tcp_check_space(struct sock *sk) +/* Caller made space either from: + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) + * + * We might be able to generate EPOLLOUT to the application if: + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became + * small enough that tcp_stream_memory_free() decides it + * is time to generate EPOLLOUT. + */ +void tcp_check_space(struct sock *sk) { - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + /* pairs with tcp_poll() */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -5270,20 +5531,18 @@ send_now: } if (!tcp_is_sack(tp) || - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = 0; + tp->dup_ack_counter = 0; } - - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { + tp->dup_ack_counter++; goto send_now; - + } + tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; @@ -5293,11 +5552,13 @@ send_now: if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + delay = min_t(unsigned long, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_SOFT); + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) @@ -5324,7 +5585,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) ptr--; ptr += ntohl(th->seq); @@ -5377,7 +5638,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } } - tp->urg_data = TCP_URG_NOTYET; + WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET); WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ @@ -5390,11 +5651,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t struct tcp_sock *tp = tcp_sk(sk); /* Check if we get a new urgent pointer - normally not. */ - if (th->urg) + if (unlikely(th->urg)) tcp_check_urg(sk, th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == TCP_URG_NOTYET) { + if (unlikely(tp->urg_data == TCP_URG_NOTYET)) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - th->syn; @@ -5403,7 +5664,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t u8 tmp; if (skb_copy_bits(skb, ptr, &tmp, 1)) BUG(); - tp->urg_data = TCP_URG_VALID | tmp; + WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); } @@ -5434,7 +5695,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); - bool rst_seq_match = false; + SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && @@ -5446,6 +5707,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDPAWS, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); + SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto discard; } /* Reset is accepted even if it did not pass PAWS. */ @@ -5467,8 +5729,9 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { - tcp_reset(sk); + goto reset; } + SKB_DR_SET(reason, TCP_INVALID_SEQUENCE); goto discard; } @@ -5484,9 +5747,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * Send a challenge ACK */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || - tcp_reset_check(sk, skb)) { - rst_seq_match = true; - } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { + tcp_reset_check(sk, skb)) + goto reset; + + if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int max_sack = sp[0].end_seq; int this_sack; @@ -5499,21 +5763,18 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } if (TCP_SKB_CB(skb)->seq == max_sack) - rst_seq_match = true; + goto reset; } - if (rst_seq_match) - tcp_reset(sk); - else { - /* Disable TFO if RST is out-of-order - * and no data has been received - * for current active TFO socket - */ - if (tp->syn_fastopen && !tp->data_segs_in && - sk->sk_state == TCP_ESTABLISHED) - tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk, skb); - } + /* Disable TFO if RST is out-of-order + * and no data has been received + * for current active TFO socket + */ + if (tp->syn_fastopen && !tp->data_segs_in && + sk->sk_state == TCP_ESTABLISHED) + tcp_fastopen_active_disable(sk); + tcp_send_challenge_ack(sk); + SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -5527,14 +5788,22 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk, skb); + tcp_send_challenge_ack(sk); + SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } + bpf_skops_parse_hdr(sk, skb); + return true; discard: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); + return false; + +reset: + tcp_reset(sk, skb); + __kfree_skb(skb); return false; } @@ -5563,6 +5832,7 @@ discard: */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); unsigned int len = skb->len; @@ -5571,7 +5841,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) trace_tcp_probe(sk, skb); tcp_mstamp_refresh(tp); - if (unlikely(!sk->sk_rx_dst)) + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. @@ -5651,6 +5921,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ + reason = SKB_DROP_REASON_PKT_TOO_SMALL; TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -5678,6 +5949,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ + skb_dst_drop(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -5689,6 +5961,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; + } else { + tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0); @@ -5704,8 +5978,10 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; - if (!th->ack && !th->rst && !th->syn) + if (!th->ack && !th->rst && !th->syn) { + reason = SKB_DROP_REASON_TCP_FLAGS; goto discard; + } /* * Standard slow path. @@ -5715,9 +5991,11 @@ slow_path: return; step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); + if ((int)reason < 0) { + reason = -reason; goto discard; - + } tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ @@ -5731,15 +6009,17 @@ step5: return; csum_error: + reason = SKB_DROP_REASON_TCP_CSUM; + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); } EXPORT_SYMBOL(tcp_rcv_established); -void tcp_init_transfer(struct sock *sk, int bpf_op) +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5755,13 +6035,15 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) * retransmission has occurred. */ if (tp->total_retrans > 1 && tp->undo_marker) - tp->snd_cwnd = 1; + tcp_snd_cwnd_set(tp, 1); else - tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); + tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk))); tp->snd_cwnd_stamp = tcp_jiffies32; - tcp_call_bpf(sk, bpf_op, 0, NULL); - tcp_init_congestion_control(sk); + bpf_skops_established(sk, bpf_op, skb); + /* Initialize congestion control unless BPF initialized it already: */ + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -5779,7 +6061,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); } - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -5838,11 +6120,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; else tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; - skb_rbtree_walk_from(data) { - if (__tcp_retransmit_skb(sk, data, 1)) - break; - } - tcp_rearm_rto(sk); + skb_rbtree_walk_from(data) + tcp_mark_skb_lost(sk, data); + tcp_xmit_retransmit_queue(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; @@ -5893,6 +6173,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; bool fastopen_fail; + SKB_DR(reason); tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) @@ -5934,8 +6215,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->rst) { - tcp_reset(sk); - goto discard; + tcp_reset(sk, skb); +consume: + __kfree_skb(skb); + return 0; } /* rfc793: @@ -5945,9 +6228,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * See note below! * --ANK(990513) */ - if (!th->syn) + if (!th->syn) { + SKB_DR_SET(reason, TCP_FLAGS); goto discard_and_undo; - + } /* rfc793: * "If the SYN bit is on ... * are acceptable then ... @@ -5990,9 +6274,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); - if (sk_is_mptcp(sk)) - mptcp_rcv_synsent(sk); - /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ @@ -6027,13 +6308,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); - -discard: - tcp_drop(sk, skb); - return 0; - } else { - tcp_send_ack(sk); + goto consume; } + tcp_send_ack(sk); return -1; } @@ -6045,15 +6322,16 @@ discard: * * Otherwise (no ACK) drop the segment and return." */ - + SKB_DR_SET(reason, TCP_RESET); goto discard_and_undo; } /* PAWS check. */ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && - tcp_paws_reject(&tp->rx_opt, 0)) + tcp_paws_reject(&tp->rx_opt, 0)) { + SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto discard_and_undo; - + } if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -6102,7 +6380,7 @@ discard: */ return -1; #else - goto discard; + goto consume; #endif } /* "fifth, if neither of the SYN or RST bits is set then @@ -6112,7 +6390,8 @@ discard: discard_and_undo: tcp_clear_options(&tp->rx_opt); tp->rx_opt.mss_clamp = saved_clamp; - goto discard; + tcp_drop_reason(sk, skb, reason); + return 0; reset_and_undo: tcp_clear_options(&tp->rx_opt); @@ -6167,21 +6446,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) struct request_sock *req; int queued = 0; bool acceptable; + SKB_DR(reason); switch (sk->sk_state) { case TCP_CLOSE: + SKB_DR_SET(reason, TCP_CLOSE); goto discard; case TCP_LISTEN: if (th->ack) return 1; - if (th->rst) + if (th->rst) { + SKB_DR_SET(reason, TCP_RESET); goto discard; - + } if (th->syn) { - if (th->fin) + if (th->fin) { + SKB_DR_SET(reason, TCP_FLAGS); goto discard; + } /* It is possible that we process SYN packets from backlog, * so we need to make sure to disable BH and RCU right there. */ @@ -6196,6 +6480,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) consume_skb(skb); return 0; } + SKB_DR_SET(reason, TCP_FLAGS); goto discard; case TCP_SYN_SENT: @@ -6222,13 +6507,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true, &req_stolen)) + if (!tcp_check_req(sk, skb, req, true, &req_stolen)) { + SKB_DR_SET(reason, TCP_FASTOPEN); goto discard; + } } - if (!th->ack && !th->rst && !th->syn) + if (!th->ack && !th->rst && !th->syn) { + SKB_DR_SET(reason, TCP_FLAGS); goto discard; - + } if (!tcp_validate_incoming(sk, skb, th, 0)) return 0; @@ -6240,7 +6528,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!acceptable) { if (sk->sk_state == TCP_SYN_RECV) return 1; /* send one RST */ - tcp_send_challenge_ack(sk, skb); + tcp_send_challenge_ack(sk); + SKB_DR_SET(reason, TCP_OLD_ACK); goto discard; } switch (sk->sk_state) { @@ -6254,7 +6543,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, + skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); @@ -6333,7 +6623,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) inet_csk_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; + goto consume; } break; } @@ -6341,7 +6631,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { tcp_time_wait(sk, TCP_TIME_WAIT, 0); - goto discard; + goto consume; } break; @@ -6349,7 +6639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->snd_una == tp->write_seq) { tcp_update_metrics(sk); tcp_done(sk); - goto discard; + goto consume; } break; } @@ -6363,11 +6653,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSING: case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb, &tp->rx_opt); + /* If a subflow has been reset, the packet should not + * continue to be processed, drop the packet. + */ + if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) + goto discard; break; } - /* fall through */ + fallthrough; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, @@ -6378,11 +6671,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); + tcp_reset(sk, skb); return 1; } } - /* Fall through */ + fallthrough; case TCP_ESTABLISHED: tcp_data_queue(sk, skb); queued = 1; @@ -6397,9 +6690,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!queued) { discard: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); } return 0; + +consume: + __kfree_skb(skb); + return 0; } EXPORT_SYMBOL(tcp_rcv_state_process); @@ -6450,7 +6747,7 @@ static void tcp_ecn_create_request(struct request_sock *req, ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); - ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; + ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || @@ -6465,7 +6762,6 @@ static void tcp_openreq_init(struct request_sock *req, struct inet_request_sock *ireq = inet_rsk(req); req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; @@ -6482,7 +6778,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok; + ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && + tcp_sk(sk)->smc_hs_congested(sk)); #endif } @@ -6504,6 +6801,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); ireq->ireq_family = sk_listener->sk_family; + req->timeout = TCP_TIMEOUT_INIT; } return req; @@ -6517,11 +6815,14 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; - bool want_cookie = false; struct net *net = sock_net(sk); + bool want_cookie = false; + u8 syncookies; + + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); #ifdef CONFIG_SYN_COOKIES - if (net->ipv4.sysctl_tcp_syncookies) { + if (syncookies) { msg = "Sending cookies"; want_cookie = true; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6529,8 +6830,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!queue->synflood_warned && - net->ipv4.sysctl_tcp_syncookies != 2 && + if (!queue->synflood_warned && syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, sk->sk_num, msg); @@ -6544,13 +6844,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk, { if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); - u32 *copy; + struct saved_syn *saved_syn; + u32 mac_hdrlen; + void *base; + + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ + base = skb_mac_header(skb); + mac_hdrlen = skb_mac_header_len(skb); + len += mac_hdrlen; + } else { + base = skb_network_header(skb); + mac_hdrlen = 0; + } - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); - if (copy) { - copy[0] = len; - memcpy(©[1], skb_network_header(skb), len); - req->saved_syn = copy; + saved_syn = kmalloc(struct_size(saved_syn, data, len), + GFP_ATOMIC); + if (saved_syn) { + saved_syn->mac_hdrlen = mac_hdrlen; + saved_syn->network_hdrlen = skb_network_header_len(skb); + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); + memcpy(saved_syn->data, base, len); + req->saved_syn = saved_syn; } } } @@ -6565,7 +6879,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, struct tcp_sock *tp = tcp_sk(sk); u16 mss; - if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && !inet_csk_reqsk_queue_is_full(sk)) return 0; @@ -6599,13 +6913,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, bool want_cookie = false; struct dst_entry *dst; struct flowi fl; + u8 syncookies; + + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ - if ((net->ipv4.sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; @@ -6620,6 +6936,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!req) goto drop; + req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; #if IS_ENABLED(CONFIG_MPTCP) @@ -6645,26 +6962,20 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); - - if (IS_ENABLED(CONFIG_MPTCP) && want_cookie) - tcp_rsk(req)->is_mptcp = 0; - - if (security_inet_conn_request(sk, skb, req)) + dst = af_ops->route_req(sk, skb, &fl, req); + if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - if (!want_cookie && !isn) { + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); + /* Kill the following clause, if you dislike this way. */ - if (!net->ipv4.sysctl_tcp_syncookies && - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (net->ipv4.sysctl_max_syn_backlog >> 2)) && + if (!syncookies && + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, @@ -6685,13 +6996,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; } tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { @@ -6700,7 +7011,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, TCP_SYNACK_FASTOPEN); + &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); @@ -6713,12 +7024,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; - if (!want_cookie) - inet_csk_reqsk_queue_hash_add(sk, req, - tcp_timeout_init((struct sock *)req)); + if (!want_cookie) { + req->timeout = tcp_timeout_init((struct sock *)req); + inet_csk_reqsk_queue_hash_add(sk, req, req->timeout); + } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : - TCP_SYNACK_COOKIE); + TCP_SYNACK_COOKIE, + skb); if (want_cookie) { reqsk_free(req); return 0; |