aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-01-19 15:39:31 -0500
committerDavid S. Miller <davem@davemloft.net>2018-01-19 15:39:31 -0500
commit3c1693f012bdc0a9ff1bf030f7abc9961e94211f (patch)
treef72181b7dfc5d58ee416a996028ea5c3698a206c
parenttipc: fix race between poll() and setsockopt() (diff)
parenttcp: avoid min RTT bloat by skipping RTT from delayed-ACK in BBR (diff)
downloadlinux-dev-3c1693f012bdc0a9ff1bf030f7abc9961e94211f.tar.xz
linux-dev-3c1693f012bdc0a9ff1bf030f7abc9961e94211f.zip
Merge branch 'tcp-min-rtt'
Yuchung Cheng says: ==================== tcp: do not use RTT from delayed ACKs for min-RTT This patch set prevents TCP sender from using RTT samples from (suspected) delayed ACKs as the minimum RTT, to avoid unbounded over-estimation of the network path delay. This issue is common when a connection has extended periods of one packet chit-chat beyond the min RTT filter window. The first patch does that for TCP general min RTT estimation. The second patch addresses specifically the BBR congestion control's min RTT filter. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/tcp.h1
-rw-r--r--net/ipv4/tcp_bbr.c3
-rw-r--r--net/ipv4/tcp_input.c24
3 files changed, 25 insertions, 3 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6939e69d3c37..5a1d26a18599 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -953,6 +953,7 @@ struct rate_sample {
u32 prior_in_flight; /* in flight before this ACK */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
+ bool is_ack_delayed; /* is this (likely) a delayed ACK? */
};
struct tcp_congestion_ops {
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 8322f26e770e..785712be5b0d 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -766,7 +766,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
filter_expired = after(tcp_jiffies32,
bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
if (rs->rtt_us >= 0 &&
- (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ (rs->rtt_us <= bbr->min_rtt_us ||
+ (filter_expired && !rs->is_ack_delayed))) {
bbr->min_rtt_us = rs->rtt_us;
bbr->min_rtt_stamp = tcp_jiffies32;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ff71b18d9682..cfa51cfd2d99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
+#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -2857,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
*rexmit = REXMIT_LOST;
}
-static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
struct tcp_sock *tp = tcp_sk(sk);
+ if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
+ /* If the remote keeps returning delayed ACKs, eventually
+ * the min filter would pick it up and overestimate the
+ * prop. delay when it expires. Skip suspected delayed ACKs.
+ */
+ return;
+ }
minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
rtt_us ? : jiffies_to_usecs(1));
}
@@ -2901,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* always taken together with ACK, SACK, or TS-opts. Any negative
* values will be skipped with the seq_rtt_us < 0 check above.
*/
- tcp_update_rtt_min(sk, ca_rtt_us);
+ tcp_update_rtt_min(sk, ca_rtt_us, flag);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -3125,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
+
+ if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
+ last_in_flight && !prior_sacked && fully_acked &&
+ sack->rate->prior_delivered + 1 == tp->delivered &&
+ !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
+ /* Conservatively mark a delayed ACK. It's typically
+ * from a lone runt packet over the round trip to
+ * a receiver w/o out-of-order or CE events.
+ */
+ flag |= FLAG_ACK_MAYBE_DELAYED;
+ }
}
if (sack->first_sackt) {
sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
@@ -3614,6 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
+ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);