Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  152
1 file changed, 88 insertions(+), 64 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a3d453b94747..7f18262e2326 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk);
+
+ if (!use_ecn) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+ use_ecn = true;
+ }
tp->ecn_flags = 0;
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
- tcp_ca_needs_ecn(sk)) {
+
+ if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
if (tcp_ca_needs_ecn(sk))
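
For reference, the new SYN-time ECN decision reduces to a three-way OR. A minimal standalone restatement (illustration only, not kernel code; route_has_ecn_feature is a hypothetical stand-in for dst_feature(dst, RTAX_FEATURE_ECN) on the cached route):

#include <stdbool.h>

/* ECE|CWR is requested on the SYN when the sysctl asks for it, the
 * congestion control module needs ECN, or the route carries the
 * RTAX_FEATURE_ECN feature.
 */
static bool syn_should_request_ecn(int sysctl_tcp_ecn, bool ca_needs_ecn,
				   bool route_has_ecn_feature)
{
	return sysctl_tcp_ecn == 1 || ca_needs_ecn || route_has_ecn_feature;
}
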
@@ -1515,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
((nonagle & TCP_NAGLE_CORK) ||
(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+ u32 bytes, segs;
+
+ bytes = min(sk->sk_pacing_rate >> 10,
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+ return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
const struct sk_buff *skb,
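
sk_pacing_rate is in bytes per second, so the `>> 10` above divides by 1024 and yields roughly one millisecond worth of bytes. A userspace sketch of the same math, omitting the kernel's clamp of bytes to sk_gso_max_size - 1 - MAX_TCP_HEADER:

#include <stdio.h>
#include <stdint.h>

/* Standalone illustration of the tcp_tso_autosize() arithmetic
 * (not kernel code). pacing_rate is in bytes per second, so
 * rate >> 10 approximates the byte budget for 1 ms of sending.
 */
static uint32_t tso_autosize(uint64_t pacing_rate, uint32_t mss,
			     uint32_t min_tso_segs, uint32_t gso_max_segs)
{
	uint32_t bytes = pacing_rate >> 10;	/* ~1 ms worth of data */
	uint32_t segs = bytes / mss;

	if (segs < min_tso_segs)
		segs = min_tso_segs;
	if (segs > gso_max_segs)
		segs = gso_max_segs;
	return segs;
}

int main(void)
{
	/* 1 Gbit/s pacing, 1448-byte MSS: ~122 KB/ms -> 84 segments */
	printf("%u\n", tso_autosize(125000000ULL, 1448, 2, 65535));
	/* 10 Mbit/s pacing: ~1.2 KB/ms -> clamped up to min_tso_segs */
	printf("%u\n", tso_autosize(1250000ULL, 1448, 2, 65535));
	return 0;
}

The resulting max_segs is also what tcp_tso_should_defer() compares against below: once the send/congestion window admits max_segs * tp->mss_cache bytes, deferral stops.
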
@@ -1553,7 +1583,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
- u32 in_flight, cwnd;
+ u32 in_flight, cwnd, halfcwnd;
/* Don't be strict about the congestion window for the final FIN. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
@@ -1562,10 +1592,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
in_flight = tcp_packets_in_flight(tp);
cwnd = tp->snd_cwnd;
- if (in_flight < cwnd)
- return (cwnd - in_flight);
+ if (in_flight >= cwnd)
+ return 0;
- return 0;
+ /* For better scheduling, ensure we have at least
+ * 2 GSO packets in flight.
+ */
+ halfcwnd = max(cwnd >> 1, 1U);
+ return min(halfcwnd, cwnd - in_flight);
}
/* Initialize TSO state of a skb.
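
The new clamp means a sender never spends its whole window on one burst: with cwnd = 10 and nothing in flight, the old code returned a quota of 10, while the new code returns 5, so the window is covered by at least two GSO packets. A userspace sketch of the reworked quota:

#include <stdio.h>
#include <stdint.h>

/* Illustration of the new tcp_cwnd_test() clamp (not kernel code). */
static uint32_t cwnd_quota(uint32_t cwnd, uint32_t in_flight)
{
	uint32_t halfcwnd;

	if (in_flight >= cwnd)
		return 0;
	halfcwnd = cwnd / 2 > 1 ? cwnd / 2 : 1;	/* max(cwnd >> 1, 1U) */
	return halfcwnd < cwnd - in_flight ? halfcwnd : cwnd - in_flight;
}

int main(void)
{
	printf("%u\n", cwnd_quota(10, 0));	/* 5, not 10 */
	printf("%u\n", cwnd_quota(10, 8));	/* 2: remaining window wins */
	return 0;
}
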
@@ -1718,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
* This algorithm is from John Heffner.
*/
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
- bool *is_cwnd_limited)
+ bool *is_cwnd_limited, u32 max_segs)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1748,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
- if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
- tp->xmit_size_goal_segs * tp->mss_cache))
+ if (limit >= max_segs * tp->mss_cache)
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
@@ -1946,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int cwnd_quota;
int result;
bool is_cwnd_limited = false;
+ u32 max_segs;
sent_pkts = 0;
@@ -1959,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
+ max_segs = tcp_tso_autosize(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -1991,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
} else {
if (!push_one &&
- tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+ max_segs))
break;
}
+ limit = mss_now;
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ min_t(unsigned int,
+ cwnd_quota,
+ max_segs),
+ nonagle);
+
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ break;
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* This allows for :
@@ -2005,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
* of queued bytes to ensure line rate.
* One example is wifi aggregation (802.11 AMPDU)
*/
- limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
- sk->sk_pacing_rate >> 10);
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2019,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- limit = mss_now;
- if (tso_segs > 1 && !tcp_urg_mode(tp))
- limit = tcp_mss_split_point(sk, skb, mss_now,
- min_t(unsigned int,
- cwnd_quota,
- sk->sk_gso_max_segs),
- nonagle);
-
- if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
- break;
-
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
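
Note the inversion of sysctl_tcp_limit_output_bytes in the hunk above: it used to be a floor on the TSQ limit and is now a cap, with the floor supplied by 2 * skb->truesize so that at least two packets can sit in the qdisc/device queues. A userspace sketch with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

/* Illustration of the reworked TSQ limit (not kernel code). */
static uint32_t tsq_limit(uint32_t truesize, uint64_t pacing_rate,
			  uint32_t limit_output_bytes)
{
	uint32_t limit = 2 * truesize;		/* room for two packets */
	uint32_t ms_of_data = pacing_rate >> 10;	/* ~1 ms at pacing rate */

	if (ms_of_data > limit)
		limit = ms_of_data;
	if (limit > limit_output_bytes)		/* sysctl is now a cap */
		limit = limit_output_bytes;
	return limit;
}

int main(void)
{
	/* 64 KB TSO skb at 1 Gbit/s pacing, 256 KB sysctl cap -> 132000 */
	printf("%u\n", tsq_limit(66000, 125000000ULL, 262144));
	return 0;
}
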
@@ -2998,9 +3034,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
- struct sk_buff *syn_data = NULL, *data;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
+ struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
@@ -3031,48 +3067,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
- sk->sk_allocation);
- if (syn_data == NULL)
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ if (!syn_data)
+ goto fallback;
+ syn_data->ip_summed = CHECKSUM_PARTIAL;
+ memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+ if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
+ fo->data->msg_iter.iov, 0, space))) {
+ kfree_skb(syn_data);
goto fallback;
+ }
- for (i = 0; i < iovlen && syn_data->len < space; ++i) {
- struct iovec *iov = &fo->data->msg_iov[i];
- unsigned char __user *from = iov->iov_base;
- int len = iov->iov_len;
+ /* No more data pending in inet_wait_for_connect() */
+ if (space == fo->size)
+ fo->data = NULL;
+ fo->copied = space;
- if (syn_data->len + len > space)
- len = space - syn_data->len;
- else if (i + 1 == iovlen)
- /* No more data pending in inet_wait_for_connect() */
- fo->data = NULL;
+ tcp_connect_queue_skb(sk, syn_data);
- if (skb_add_data(syn_data, from, len))
- goto fallback;
- }
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- /* Queue a data-only packet after the regular SYN for retransmission */
- data = pskb_copy(syn_data, sk->sk_allocation);
- if (data == NULL)
- goto fallback;
- TCP_SKB_CB(data)->seq++;
- TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
- TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
- tcp_connect_queue_skb(sk, data);
- fo->copied = data->len;
-
- /* syn_data is about to be sent, we need to take current time stamps
- * for the packets that are in write queue : SYN packet and DATA
- */
- skb_mstamp_get(&syn->skb_mstamp);
- data->skb_mstamp = syn->skb_mstamp;
+ syn->skb_mstamp = syn_data->skb_mstamp;
- if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ /* Now full SYN+DATA was cloned and sent (or not),
+ * remove the SYN from the original skb (syn_data)
+ * we keep in write queue in case of a retransmit, as we
+ * also have the SYN packet (with no data) in the same queue.
+ */
+ TCP_SKB_CB(syn_data)->seq++;
+ TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+ if (!err) {
tp->syn_data = (fo->copied > 0);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
goto done;
}
- syn_data = NULL;
fallback:
/* Send a regular SYN with Fast Open cookie request option */
@@ -3081,7 +3109,6 @@ fallback:
err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
if (err)
tp->syn_fastopen = 0;
- kfree_skb(syn_data);
done:
fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
return err;
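
The tcp_send_syn_data() rewrite above replaces the old per-iovec skb_add_data() loop with a single memcpy_fromiovecend() over fo->data->msg_iter.iov, and keeps one syn_data skb on the write queue, stripping its SYN flag after transmit, instead of cloning a separate data-only skb. For readers unfamiliar with the helper, roughly what an iovec-to-flat-buffer copy does (simplified sketch; the real kernel helper copies from user space, handles faults, and assumes the iovec array covers offset + len bytes):

#include <string.h>
#include <sys/uio.h>

/* Copy len bytes starting at offset out of an iovec array into a
 * flat buffer (userspace illustration, not the kernel helper).
 */
static void copy_from_iovec(unsigned char *to, const struct iovec *iov,
			    size_t offset, size_t len)
{
	/* Skip whole iovecs covered by the offset. */
	while (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
	}
	while (len) {
		size_t chunk = iov->iov_len - offset;

		if (chunk > len)
			chunk = len;
		memcpy(to, (unsigned char *)iov->iov_base + offset, chunk);
		to += chunk;
		len -= chunk;
		offset = 0;
		iov++;
	}
}

int main(void)
{
	unsigned char out[11];
	struct iovec iov[2] = {
		{ .iov_base = "hello ", .iov_len = 6 },
		{ .iov_base = "world", .iov_len = 5 },
	};

	copy_from_iovec(out, iov, 0, sizeof(out));
	return 0;
}
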
@@ -3101,13 +3128,10 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
- if (unlikely(buff == NULL))
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (unlikely(!buff))
return -ENOBUFS;
- /* Reserve space for headers. */
- skb_reserve(buff, MAX_TCP_HEADER);
-
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tp->retrans_stamp = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff);