diff options
author | 2017-02-22 10:15:09 -0800 | |
---|---|---|
committer | 2017-02-22 10:15:09 -0800 | |
commit | 3051bf36c25d5153051704291782f8d44e744d36 (patch) | |
tree | 72dfc8a1d12675c6f2981d13102df954b678f11b /net/ipv4/tcp_input.c | |
parent | Merge tag 'gcc-plugins-v4.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux (diff) | |
parent | Revert "ath10k: Search SMBIOS for OEM board file extension" (diff) | |
download | wireguard-linux-3051bf36c25d5153051704291782f8d44e744d36.tar.xz wireguard-linux-3051bf36c25d5153051704291782f8d44e744d36.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Support TX_RING in AF_PACKET TPACKET_V3 mode, from Sowmini
Varadhan.
2) Simplify classifier state on sk_buff in order to shrink it a bit.
From Willem de Bruijn.
3) Introduce SIPHASH and it's usage for secure sequence numbers and
syncookies. From Jason A. Donenfeld.
4) Reduce CPU usage for ICMP replies we are going to limit or
suppress, from Jesper Dangaard Brouer.
5) Introduce Shared Memory Communications socket layer, from Ursula
Braun.
6) Add RACK loss detection and allow it to actually trigger fast
recovery instead of just assisting after other algorithms have
triggered it. From Yuchung Cheng.
7) Add xmit_more and BQL support to mvneta driver, from Simon Guinot.
8) skb_cow_data avoidance in esp4 and esp6, from Steffen Klassert.
9) Export MPLS packet stats via netlink, from Robert Shearman.
10) Significantly improve inet port bind conflict handling, especially
when an application is restarted and changes it's setting of
reuseport. From Josef Bacik.
11) Implement TX batching in vhost_net, from Jason Wang.
12) Extend the dummy device so that VF (virtual function) features,
such as configuration, can be more easily tested. From Phil
Sutter.
13) Avoid two atomic ops per page on x86 in bnx2x driver, from Eric
Dumazet.
14) Add new bpf MAP, implementing a longest prefix match trie. From
Daniel Mack.
15) Packet sample offloading support in mlxsw driver, from Yotam Gigi.
16) Add new aquantia driver, from David VomLehn.
17) Add bpf tracepoints, from Daniel Borkmann.
18) Add support for port mirroring to b53 and bcm_sf2 drivers, from
Florian Fainelli.
19) Remove custom busy polling in many drivers, it is done in the core
networking since 4.5 times. From Eric Dumazet.
20) Support XDP adjust_head in virtio_net, from John Fastabend.
21) Fix several major holes in neighbour entry confirmation, from
Julian Anastasov.
22) Add XDP support to bnxt_en driver, from Michael Chan.
23) VXLAN offloads for enic driver, from Govindarajulu Varadarajan.
24) Add IPVTAP driver (IP-VLAN based tap driver) from Sainath Grandhi.
25) Support GRO in IPSEC protocols, from Steffen Klassert"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1764 commits)
Revert "ath10k: Search SMBIOS for OEM board file extension"
net: socket: fix recvmmsg not returning error from sock_error
bnxt_en: use eth_hw_addr_random()
bpf: fix unlocking of jited image when module ronx not set
arch: add ARCH_HAS_SET_MEMORY config
net: napi_watchdog() can use napi_schedule_irqoff()
tcp: Revert "tcp: tcp_probe: use spin_lock_bh()"
net/hsr: use eth_hw_addr_random()
net: mvpp2: enable building on 64-bit platforms
net: mvpp2: switch to build_skb() in the RX path
net: mvpp2: simplify MVPP2_PRS_RI_* definitions
net: mvpp2: fix indentation of MVPP2_EXT_GLOBAL_CTRL_DEFAULT
net: mvpp2: remove unused register definitions
net: mvpp2: simplify mvpp2_bm_bufs_add()
net: mvpp2: drop useless fields in mvpp2_bm_pool and related code
net: mvpp2: remove unused 'tx_skb' field of 'struct mvpp2_tx_queue'
net: mvpp2: release reference to txq_cpu[] entry after unmapping
net: mvpp2: handle too large value in mvpp2_rx_time_coal_set()
net: mvpp2: handle too large value handling in mvpp2_rx_pkts_coal_set()
net: mvpp2: remove useless arguments in mvpp2_rx_{pkts, time}_coal_set
...
Diffstat (limited to '')
-rw-r--r-- | net/ipv4/tcp_input.c | 271 |
1 files changed, 119 insertions, 152 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 41dcbd568cbe..2c0ff327b6df 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; -int sysctl_tcp_fack __read_mostly = 1; +int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; - -int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tcp_disable_fack(tp); } - if (metric > 0) - tcp_disable_early_retrans(tp); tp->rack.reord = 1; } @@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; - - if (!tp->lost_out || - after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } /* Sum the number of packets on the wire we have marked as lost. @@ -1135,6 +1126,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, xmit_time, sacked); + tcp_rack_advance(tp, sacked, end_seq, + xmit_time, &state->ack_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; - bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous - * loss recovery is underway except recurring timeout(s) on - * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO + * if a previous recovery is underway, otherwise it may incorrectly + * call a timeout spurious if some previously retransmitted packets + * are s/acked (sec 3.2). We do not apply that retriction since + * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS + * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO + * on PTMU discovery to avoid sending new data. */ - tp->frto = sysctl_tcp_frto && - (new_recovery || icsk->icsk_retransmits) && - !inet_csk(sk)->icsk_mtup.probe_size; + tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static bool tcp_pause_early_retransmit(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned long delay; - - /* Delay early retransmit and entering fast recovery for - * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples - * available, or RTO is scheduled to fire first. - */ - if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt_us) - return false; - - delay = max(usecs_to_jiffies(tp->srtt_us >> 5), - msecs_to_jiffies(2)); - - if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) - return false; - - inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, - TCP_RTO_MAX); - return true; -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * - * Essentially, we have now two algorithms counting + * Essentially, we have now a few algorithms detecting * lost packets. * - * FACK: It is the simplest heuristics. As soon as we decided + * If the receiver supports SACK: + * + * RFC6675/3517: It is the conventional algorithm. A packet is + * considered lost if the number of higher sequence packets + * SACKed is greater than or equal the DUPACK thoreshold + * (reordering). This is implemented in tcp_mark_head_lost and + * tcp_update_scoreboard. + * + * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * (2017-) that checks timing instead of counting DUPACKs. + * Essentially a packet is considered lost if it's not S/ACKed + * after RTT + reordering_window, where both metrics are + * dynamically measured and adjusted. This is implemented in + * tcp_rack_mark_lost. + * + * FACK (Disabled by default. Subsumbed by RACK): + * It is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. @@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * - * NewReno: when Recovery is entered, we assume that one segment + * If the receiver does not support SACK: + * + * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Imagine, that's all! Forget about all this shamanism about CWND inflation - * deflation etc. CWND is real congestion window, never inflated, changes - * only according to classic VJ rules. - * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, @@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out; - int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#4: It is still not OK... But will it be useful to delay - * recovery more? - */ - packets_out = tp->packets_out; - if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && - !tcp_may_send_now(sk)) { - /* We have nothing to send. This connection is limited - * either by receiver window or by application. - */ - return true; - } - - /* If a thin stream is detected, retransmit after first - * received dupack. Employ only if SACK is supported in order - * to avoid possible corner-case series of spurious retransmissions - * Use only if there are no unsent data. - */ - if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && - tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && - tcp_is_sack(tp) && !tcp_send_head(sk)) - return true; - - /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious - * retransmissions due to small network reorderings, we implement - * Mitigation A.3 in the RFC and delay the retransmission for a short - * interval if appropriate. - */ - if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); - return false; } @@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, - int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); int mib_idx; @@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, true)) - return; + /* The ACK (s)acks some never-retransmitted data meaning not all + * the data packets before the timeout were lost. Therefore we + * undo the congestion window and state. This is essentially + * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since + * a retransmitted skb is permantly marked, we can apply such an + * operation even if F-RTO was not used. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, tp->undo_marker)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ @@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, + const struct skb_mstamp *ack_time) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + u32 prior_retrans = tp->retrans_out; + + tcp_rack_mark_lost(sk, ack_time); + if (prior_retrans > tp->retrans_out) + *ack_flag |= FLAG_LOST_RETRANS; + } +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit) + bool is_dupack, int *ack_flag, int *rexmit, + const struct skb_mstamp *ack_time) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } - /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) { - flag |= FLAG_LOST_RETRANS; - *ack_flag |= FLAG_LOST_RETRANS; - } - /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } + tcp_rack_identify_loss(sk, ack_flag, ack_time); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - if (icsk->icsk_ca_state != TCP_CA_Open && - !(flag & FLAG_LOST_RETRANS)) + tcp_rack_identify_loss(sk, ack_flag, ack_time); + if (!(icsk->icsk_ca_state == TCP_CA_Open || + (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ default: @@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); + tcp_rack_identify_loss(sk, ack_flag, ack_time); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = @@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk) } } -/* This function is called when the delayed ER timer fires. TCP enters - * fast recovery and performs fast-retransmit. - */ -void tcp_resume_early_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_rearm_rto(sk); - - /* Stop if ER is disabled after the delayed ER timer is scheduled */ - if (!tp->do_early_retrans) - return; - - tcp_enter_recovery(sk, false); - tcp_update_scoreboard(sk, 1); - tcp_xmit_retransmit_queue(sk); -} - /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack, - struct skb_mstamp *now) + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; + struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } else if (tcp_is_sack(tp)) { tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + tcp_rack_advance(tp, sacked, scb->end_seq, + &skb->skb_mstamp, + &sack->ack_time); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - struct skb_mstamp now; sack_state.first_sackt.v64 = 0; sack_state.rate = &rs; @@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&now); + skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) { @@ -3689,34 +3634,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state, &now); + &sack_state); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { - struct dst_entry *dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - } + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) + sk_dst_confirm(sk); if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag, &rs); + tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, + sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3737,9 +3682,11 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { + skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); tcp_xmit_recovery(sk, rexmit); } @@ -4557,6 +4504,7 @@ add_sack: end: if (skb) { tcp_grow_window(sk, skb); + skb_condense(skb); skb_set_owner_r(skb, sk); } } @@ -5249,6 +5197,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) return err; } +/* Accept RST for rcv_nxt - 1 after a FIN. + * When tcp connections are abruptly terminated from Mac OSX (via ^C), a + * FIN is sent followed by a RST packet. The RST is sent with the same + * sequence number as the FIN, and thus according to RFC 5961 a challenge + * ACK should be sent. However, Mac OSX rate limits replies to challenge + * ACKs on the closed socket. In addition middleboxes can drop either the + * challenge ACK or a subsequent RST. + */ +static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && + (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | + TCPF_CLOSING)); +} + /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5287,20 +5252,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); + } else if (tcp_reset_check(sk, skb)) { + tcp_reset(sk); } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - /* RFC 5961 3.2 (extend to match against SACK too if available): - * If seq num matches RCV.NXT or the right-most SACK block, + /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a + * FIN and SACK too if available): + * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or + * the right-most SACK block, * then * RESET the connection * else * Send a challenge ACK */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || + tcp_reset_check(sk, skb)) { rst_seq_match = true; } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { struct tcp_sack_block *sp = &tp->selective_acks[0]; @@ -6022,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; case TCP_FIN_WAIT1: { - struct dst_entry *dst; int tmo; /* If we enter the TCP_FIN_WAIT1 state and we are a @@ -6049,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); + sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ @@ -6363,7 +6330,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tcp_death_row.sysctl_tw_recycle) { + if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { bool strict; dst = af_ops->route_req(sk, &fl, req, &strict); @@ -6377,8 +6344,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } /* Kill the following clause, if you dislike this way. */ else if (!net->ipv4.sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of |