Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 623
1 file changed, 422 insertions, 201 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 306e25d743e8..c69f4d966024 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -82,6 +82,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); + tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one @@ -142,7 +143,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 cwnd = tp->snd_cwnd; + u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); @@ -151,7 +152,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; - tp->snd_cwnd = max(cwnd, restart_cwnd); + tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } @@ -166,16 +167,13 @@ static void tcp_event_data_sent(struct tcp_sock *tp, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - /* If this is the first data packet sent in response to the - * previous received data, - * and it is a reply for ato after last received packet, - * increase pingpong count. - */ - if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && - (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - inet_csk_inc_pingpong_cnt(sk); - tp->lsndtime = now; + + /* If it is a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + inet_csk_enter_pingpong_mode(sk); } /* Account for an ACK we sent. */ @@ -184,10 +182,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { + if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = TCP_FASTRETRANS_THRESH; + tp->compressed_ack); + tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } @@ -229,7 +227,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = min_t(u32, space, U16_MAX); @@ -240,8 +238,8 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); - space = max_t(u32, space, sysctl_rmem_max); + space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); + space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, *window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); @@ -284,7 +282,7 @@ static u16 tcp_select_window(struct sock *sk) * scaled window. 
*/ if (!tp->rx_opt.rcv_wscale && - sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -323,7 +321,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { @@ -345,7 +343,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { - if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ @@ -394,7 +392,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; - TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); @@ -409,13 +406,13 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) return tp->snd_una != tp->snd_up; } -#define OPTION_SACK_ADVERTISE (1 << 0) -#define OPTION_TS (1 << 1) -#define OPTION_MD5 (1 << 2) -#define OPTION_WSCALE (1 << 3) -#define OPTION_FAST_OPEN_COOKIE (1 << 8) -#define OPTION_SMC (1 << 9) -#define OPTION_MPTCP (1 << 10) +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) +#define OPTION_SMC BIT(9) +#define OPTION_MPTCP BIT(10) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -438,20 +435,162 @@ struct tcp_out_options { u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ + u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; -static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, + struct tcp_sock *tp, + struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) - mptcp_write_options(ptr, &opts->mptcp); + mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } +#ifdef CONFIG_CGROUP_BPF +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, + enum tcp_synack_type synack_type) +{ + if (unlikely(!skb)) + return BPF_WRITE_HDR_TCP_CURRENT_MSS; + + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; + + return 0; +} + +/* req, syn_skb and synack_type are used when writing synack */ +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || + !*remaining) + return; + + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ + + /* init sock_ops */ + memset(&sock_ops, 0, 
offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; + + if (req) { + /* The listen "sk" cannot be passed here because + * it is not locked. It would not make too much + * sense to do bpf_setsockopt(listen_sk) based + * on individual connection request also. + * + * Thus, "req" is passed here and the cgroup-bpf-progs + * of the listen "sk" will be run. + * + * "req" is also used here for fastopen even the "sk" here is + * a fullsock "child" sk. It is to keep the behavior + * consistent between fastopen and non-fastopen on + * the bpf programming side. + */ + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = *remaining; + /* tcp_current_mss() does not pass a skb */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, 0); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err || sock_ops.remaining_opt_len == *remaining) + return; + + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; + /* round up to 4 bytes */ + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; + + *remaining -= opts->bpf_opt_len; +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!max_opt_len)) + return; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; + + if (req) { + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = max_opt_len; + first_opt_off = tcp_hdrlen(skb) - max_opt_len; + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err) + nr_written = 0; + else + nr_written = max_opt_len - sock_ops.remaining_opt_len; + + if (nr_written < max_opt_len) + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, + max_opt_len - nr_written); +} +#else +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ +} +#endif + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -465,9 +604,10 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). 
*/ -static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, +static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts) { + __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (unlikely(OPTION_MD5 & options)) { @@ -561,7 +701,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options); - mptcp_options_write(ptr, opts); + mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -648,18 +788,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; - if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } - if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; @@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -700,7 +842,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -715,7 +859,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. 
*/ - ireq->tstamp_ok &= !ireq->sack_ok; + if (synack_type != TCP_SYNACK_COOKIE) + ireq->tstamp_ok &= !ireq->sack_ok; } #endif @@ -756,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, + synack_type, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -824,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + + size = MAX_TCP_OPTION_SPACE - remaining; + } + return size; } @@ -856,7 +1013,7 @@ static void tcp_tsq_write(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && - tp->snd_cwnd > tcp_packets_in_flight(tp)) { + tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } @@ -881,9 +1038,9 @@ static void tcp_tsq_handler(struct sock *sk) * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ -static void tcp_tasklet_func(unsigned long data) +static void tcp_tasklet_func(struct tasklet_struct *t) { - struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; @@ -968,9 +1125,7 @@ void __init tcp_tasklet_init(void) struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); INIT_LIST_HEAD(&tsq->head); - tasklet_init(&tsq->tasklet, - tcp_tasklet_func, - (unsigned long)tsq); + tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } @@ -1064,6 +1219,10 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1094,10 +1253,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); if (clone_it) { - TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { @@ -1109,6 +1266,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; + /* retransmit skbs might have a non zero value in skb->dev + * because skb->dev is aliased with skb->rbnode.rb_left + */ + skb->dev = NULL; } inet = inet_sk(sk); @@ -1155,7 +1316,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? 
__sock_wfree : tcp_wfree; - skb_set_hash_from_sk(skb, sk); refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); @@ -1183,7 +1343,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts); skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); @@ -1194,16 +1353,24 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ th->window = htons(min(tp->rcv_wnd, 65535U)); } + + tcp_options_write(th, tp, &opts); + #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { - sk_nocaps_add(sk, NETIF_F_GSO_MASK); + sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, skb); } #endif - icsk->icsk_af_ops->send_check(sk, skb); + /* BPF prog is the last one writing header option */ + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, + tcp_v6_send_check, tcp_v4_send_check, + sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); @@ -1219,6 +1386,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); + skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); @@ -1231,7 +1399,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_add_tx_delay(skb, tp); - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, + inet6_csk_xmit, ip_queue_xmit, + sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); @@ -1380,7 +1550,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ - limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && @@ -1389,14 +1559,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } - if (skb_unclone(skb, gfp)) + if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, gfp, true); + buff = tcp_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. 
*/ skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -1418,9 +1589,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - buff->ip_summed = CHECKSUM_PARTIAL; - - buff->tstamp = skb->tstamp; + skb_set_delivery_time(buff, skb->tstamp, true); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -1498,19 +1667,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; - skb->ip_summed = CHECKSUM_PARTIAL; if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -1548,7 +1716,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); + mss_now = max(mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } @@ -1559,6 +1728,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } +EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -1590,10 +1760,10 @@ void tcp_mtup_init(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); - icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; @@ -1691,9 +1861,9 @@ static void tcp_cwnd_application_limited(struct sock *sk) /* Limited by application or receiver window. */ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { + if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } @@ -1705,14 +1875,20 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); - /* Track the maximum number of outstanding packets in each - * window, and remember whether we were cwnd-limited then. + /* Track the strongest available signal of the degree to which the cwnd + * is fully utilized. If cwnd-limited then remember that fact for the + * current window. If not cwnd-limited then track the maximum number of + * outstanding packets in the current window. 
(If cwnd-limited then we + * chose to not update tp->max_packets_out to avoid an extra else + * clause with no functional impact.) */ - if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { - tp->max_packets_out = tp->packets_out; - tp->max_packets_seq = tp->snd_nxt; + if (!before(tp->snd_una, tp->cwnd_usage_seq) || + is_cwnd_limited || + (!tp->is_cwnd_limited && + tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; + tp->max_packets_out = tp->packets_out; + tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { @@ -1724,7 +1900,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1781,25 +1957,34 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, } /* Return how many segs we'd like on a TSO packet, - * to send one TSO packet per ms + * depending on current pacing rate, and how close the peer is. + * + * Rationale is: + * - For close peers, we rather send bigger packets to reduce + * cpu costs, because occasional losses will be repaired fast. + * - For long distance/rtt flows, we would like to get ACK clocking + * with 1 ACK per ms. + * + * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. We we cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { - u32 bytes, segs; + unsigned long bytes; + u32 r; - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift); - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. - * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. - */ - segs = max_t(u32, bytes / mss_now, min_tso_segs); + r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); + if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) + bytes += sk->sk_gso_max_size >> r; + + bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); - return segs; + return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. @@ -1812,7 +1997,7 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) min_tso = ca_ops->min_tso_segs ? 
ca_ops->min_tso_segs(sk) : - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); @@ -1864,7 +2049,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, return 1; in_flight = tcp_packets_in_flight(tp); - cwnd = tp->snd_cwnd; + cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; @@ -1948,10 +2133,11 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp, true); + buff = tcp_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -1968,12 +2154,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; - /* This packet was never sent out yet, so no SACK bits. */ - TCP_SKB_CB(buff)->sacked = 0; - tcp_skb_fragment_eor(skb, buff); - buff->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2020,12 +2202,12 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); - BUG_ON(tp->snd_cwnd <= in_flight); + BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ - cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); @@ -2039,7 +2221,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { - u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. @@ -2103,7 +2285,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) u32 interval; s32 delta; - interval = net->ipv4.sysctl_tcp_probe_interval; + interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); @@ -2129,7 +2311,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; @@ -2167,7 +2351,7 @@ static int tcp_mtu_probe(struct sock *sk) if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || - tp->snd_cwnd < 11 || + tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; @@ -2185,7 +2369,7 @@ static int tcp_mtu_probe(struct sock *sk) * probing process by not resetting search range to its orignal. 
*/ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || - interval < net->ipv4.sysctl_tcp_probe_threshold) { + interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ @@ -2203,7 +2387,7 @@ static int tcp_mtu_probe(struct sock *sk) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ - if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { + if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else @@ -2214,7 +2398,7 @@ static int tcp_mtu_probe(struct sock *sk) return -1; /* We're allowed to probe. Build it now. */ - nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); + nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; sk_wmem_queued_add(sk, nskb->truesize); @@ -2222,13 +2406,11 @@ static int tcp_mtu_probe(struct sock *sk) skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); + mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; - TCP_SKB_CB(nskb)->sacked = 0; - nskb->csum = 0; - nskb->ip_summed = CHECKSUM_PARTIAL; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); @@ -2248,7 +2430,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; tcp_skb_collapse_tstamp(nskb, skb); tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); @@ -2274,7 +2456,7 @@ static int tcp_mtu_probe(struct sock *sk) if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ - tp->snd_cwnd--; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); @@ -2327,7 +2509,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && @@ -2449,7 +2631,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ - skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; + tp->tcp_wstamp_ns = tp->tcp_clock_cache; + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -2531,6 +2714,10 @@ repair: else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2538,8 +2725,6 @@ repair: /* Send one loss probe per tail loss episode. 
*/ if (push_one != 2) tcp_schedule_loss_probe(sk, false); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); @@ -2558,7 +2743,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rcu_access_pointer(tp->fastopen_rsk)) return false; - early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; + early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ @@ -2589,8 +2774,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, - TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } @@ -2598,13 +2782,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ -static bool skb_still_in_host_queue(const struct sock *sk, +static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - return true; + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); + if (skb_fclone_busy(sk, skb)) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } } return false; } @@ -2619,6 +2807,11 @@ void tcp_send_loss_probe(struct sock *sk) int pcount; int mss = tcp_current_mss(sk); + /* At most one outstanding TLP */ + if (tp->tlp_high_seq) + goto rearm_timer; + + tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; @@ -2631,15 +2824,11 @@ void tcp_send_loss_probe(struct sock *sk) if (unlikely(!skb)) { WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", - tp->packets_out, sk->sk_state, tp->snd_cwnd, mss); + tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); inet_csk(sk)->icsk_pending = 0; return; } - /* At most one outstanding TLP retransmission. */ - if (tp->tlp_high_seq) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2661,10 +2850,12 @@ void tcp_send_loss_probe(struct sock *sk) if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; + tp->tlp_retrans = 1; + +probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; -probe_sent: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. 
tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; @@ -2768,8 +2959,12 @@ u32 __tcp_select_window(struct sock *sk) int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); - int full_space = min_t(int, tp->window_clamp, allowed_space); - int window; + int full_space, window; + + if (sk_is_mptcp(sk)) + mptcp_space(sk, &free_space, &allowed_space); + + full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; @@ -2780,8 +2975,7 @@ u32 __tcp_select_window(struct sock *sk) icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, - 4U * tp->advmss); + tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. @@ -2859,13 +3053,9 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - if (next_skb_size) { - if (next_skb_size <= skb_availroom(skb)) - skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), - next_skb_size); - else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size)) - return false; - } + if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) + return false; + tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ @@ -2917,7 +3107,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; @@ -2957,20 +3147,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; - + int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: fragmentation, tunneling, mangling etc. - */ - if (refcount_read(&sk->sk_wmem_alloc) > - min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), - sk->sk_sndbuf)) - return -EAGAIN; - if (skb_still_in_host_queue(sk, skb)) return -EBUSY; @@ -2987,23 +3169,31 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); + avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case - * our retransmit serves as a zero window probe. + * our retransmit of one segment serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && - TCP_SKB_CB(skb)->seq != tp->snd_una) - return -EAGAIN; + if (avail_wnd <= 0) { + if (TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + avail_wnd = cur_mss; + } len = cur_mss * segs; + if (len > avail_wnd) { + len = rounddown(avail_wnd, cur_mss); + if (!len) + len = avail_wnd; + } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. 
*/ } else { - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); @@ -3011,8 +3201,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); - if (skb->len < cur_mss) - tcp_retrans_try_collapse(sk, skb, cur_mss); + avail_wnd = min_t(int, avail_wnd, cur_mss); + if (skb->len < avail_wnd) + tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ @@ -3037,8 +3228,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + if (nskb) { + nskb->dev = NULL; + err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); + } else { + err = -ENOBUFS; + } } tcp_skb_tsorted_restore(skb); if (!err) { @@ -3101,6 +3296,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); + bool rearm_timer = false; u32 max_segs; int mib_idx; @@ -3121,9 +3317,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!hole) tp->retransmit_skb_hint = skb; - segs = tp->snd_cwnd - tcp_packets_in_flight(tp); + segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) - return; + break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets @@ -3148,10 +3344,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; if (tcp_small_queue_check(sk, skb, 1)) - return; + break; if (tcp_retransmit_skb(sk, skb, segs)) - return; + break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); @@ -3160,11 +3356,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX, - skb); + rearm_timer = true; + } + if (rearm_timer) + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite @@ -3176,16 +3374,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt; + int delta, amt; - if (size <= sk->sk_forward_alloc) + delta = size - sk->sk_forward_alloc; + if (delta <= 0) return; - amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + amt = sk_mem_pages(delta); + sk->sk_forward_alloc += amt << PAGE_SHIFT; sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_charge_skmem(sk->sk_memcg, amt); + mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. @@ -3309,18 +3509,20 @@ int tcp_send_synack(struct sock *sk) } /** - * tcp_make_synack - Prepare a SYN-ACK. - * sk: listener socket - * dst: dst entry attached to the SYNACK - * req: request_sock pointer - * - * Allocate one skb and build a SYNACK packet. - * @dst is consumed : Caller should not use it again. + * tcp_make_synack - Allocate one skb and build a SYNACK packet. + * @sk: listener socket + * @dst: dst entry attached to the SYNACK. It is consumed and caller + * should not use it again. 
+ * @req: request_sock pointer + * @foc: cookie for tcp fast open + * @synack_type: Type of synack to prepare + * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -3364,12 +3566,13 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES - if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req, now); + if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) + skb_set_delivery_time(skb, cookie_init_timestamp(req, now), + true); else #endif { - skb->skb_mstamp_ns = now; + skb_set_delivery_time(skb, now, true); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3379,8 +3582,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + /* bpf program will be interested in the tcp_flags */ + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + foc, synack_type, + syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3400,7 +3606,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), NULL, &opts); + tcp_options_write(th, NULL, &opts); th->doff = (tcp_header_size >> 2); __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); @@ -3412,7 +3618,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - skb->skb_mstamp_ns = now; + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, + synack_type, &opts); + + skb_set_delivery_time(skb, now, true); tcp_add_tx_delay(skb, tp); return skb; @@ -3450,7 +3659,7 @@ static void tcp_connect_init(struct sock *sk) * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); - if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG @@ -3486,7 +3695,7 @@ static void tcp_connect_init(struct sock *sk) tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, - sock_net(sk)->ipv4.sysctl_tcp_window_scaling, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); @@ -3537,6 +3746,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; int space, err = 0; @@ -3551,8 +3761,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) * private TCP options. 
The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); + /* Sync mss_cache after updating the mss_clamp */ + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); @@ -3560,10 +3772,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); + syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; - syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { int copied = copy_from_iter(skb_put(syn_data, space), space, @@ -3590,7 +3801,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) @@ -3641,7 +3852,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); + buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; @@ -3712,16 +3923,15 @@ void tcp_send_delayed_ack(struct sock *sk) ato = min(ato, max_ato); } + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); + /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - */ - if (icsk->icsk_ack.blocked || - time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + /* If delack timer is about to expire, send ACK now. */ + if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } @@ -3750,10 +3960,15 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned long delay; + + delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; + if (delay < TCP_RTO_MAX) + icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + icsk->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); return; } @@ -3881,12 +4096,13 @@ void tcp_send_probe0(struct sock *sk) /* Cancel probe timer, if it is not required. 
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; return; } icsk->icsk_probes_out++; if (err <= 0) { - if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) + if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { @@ -3895,7 +4111,9 @@ void tcp_send_probe0(struct sock *sk) */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); + + timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) @@ -3904,11 +4122,14 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) struct flowi fl; int res; - tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); + /* Paired with WRITE_ONCE() in sock_setsockopt() */ + if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) + tcp_rsk(req)->txhash = net_tx_rndhash(); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + NULL); if (!res) { - __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) tcp_sk(sk)->total_retrans++; trace_tcp_retransmit_synack(sk, req); |