aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_bbr.c9
-rw-r--r--net/ipv4/tcp_output.c80
-rw-r--r--net/ipv4/tcp_timer.c3
3 files changed, 88 insertions, 4 deletions
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index b89bce4c721e..92b045c72163 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -52,10 +52,9 @@
* There is a public e-mail list for discussing BBR development and testing:
* https://groups.google.com/forum/#!forum/bbr-dev
*
- * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
- * since pacing is integral to the BBR design and implementation.
- * BBR without pacing would not function properly, and may incur unnecessary
- * high packet loss rates.
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
*/
#include <linux/module.h>
#include <net/tcp.h>
@@ -830,6 +829,8 @@ static void bbr_init(struct sock *sk)
bbr->cycle_idx = 0;
bbr_reset_lt_bw_sampling(sk);
bbr_reset_startup_mode(sk);
+
+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}
static u32 bbr_sndbuf_expand(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4858e190f6ac..a32172d69a03 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -904,6 +904,72 @@ out:
sk_free(sk);
}
+/* Note: Called under hard irq.
+ * We can not call TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+ struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+ struct sock *sk = (struct sock *)tp;
+ unsigned long nval, oval;
+
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+ struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (oval & TSQF_QUEUED)
+ break;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
+
+ if (!atomic_inc_not_zero(&sk->sk_wmem_alloc))
+ break;
+ /* queue this socket to tasklet queue */
+ tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
+ list_add(&tp->tsq_node, &tsq->head);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
+ break;
+ }
+ return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * Same remark for SO_MAX_PACING_RATE.
+ * sch_fq packet scheduler is efficiently handling pacing,
+ * but is not always installed/used.
+ * Return true if TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+ return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+ u64 len_ns;
+ u32 rate;
+
+ if (!tcp_needs_internal_pacing(sk))
+ return;
+ rate = sk->sk_pacing_rate;
+ if (!rate || rate == ~0U)
+ return;
+
+ /* Should account for header sizes as sch_fq does,
+ * but lets make things simple.
+ */
+ len_ns = (u64)skb->len * NSEC_PER_SEC;
+ do_div(len_ns, rate);
+ hrtimer_start(&tcp_sk(sk)->pacing_timer,
+ ktime_add_ns(ktime_get(), len_ns),
+ HRTIMER_MODE_ABS_PINNED);
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -1034,6 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
+ tcp_internal_pacing(sk, skb);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -2086,6 +2153,12 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+static bool tcp_pacing_check(const struct sock *sk)
+{
+ return tcp_needs_internal_pacing(sk) &&
+ hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* (These limits are doubled for retransmits)
@@ -2210,6 +2283,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ if (tcp_pacing_check(sk))
+ break;
+
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
@@ -2878,6 +2954,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+
+ if (tcp_pacing_check(sk))
+ break;
+
/* we could do better than to assign each time */
if (!hole)
tp->retransmit_skb_hint = skb;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 14672543cf0b..86934bcf685a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -710,4 +710,7 @@ void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
+ hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+ tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
}