aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c212
1 files changed, 190 insertions, 22 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 85ff417bda7f..bf48cd73e967 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -438,6 +438,7 @@ struct tcp_out_options {
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
@@ -452,6 +453,145 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
#endif
}
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+ * "req" is also used here for fastopen even the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
+#endif
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -701,7 +843,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -758,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -826,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
+ }
+
return size;
}
@@ -1213,6 +1368,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
#endif
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
tcp_v6_send_check, tcp_v4_send_check,
sk, skb);
@@ -1524,7 +1682,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
skb->truesize -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
}
/* Any change of skb->len requires recalculation of tso factor. */
@@ -3336,20 +3493,20 @@ int tcp_send_synack(struct sock *sk)
}
/**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- * foc: cookie for tcp fast open
- * synack_type: Type of synback to prepare
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3408,8 +3565,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc, synack_type) + sizeof(*th);
+ foc, synack_type,
+ syn_skb) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3441,6 +3601,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_unlock();
#endif
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
skb->skb_mstamp_ns = now;
tcp_add_tx_delay(skb, tp);
@@ -3741,16 +3904,15 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- */
- if (icsk->icsk_ack.blocked ||
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ /* If delack timer is about to expire, send ACK now. */
+ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
@@ -3779,10 +3941,15 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
buff = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long delay;
+
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
+ if (delay < TCP_RTO_MAX)
+ icsk->icsk_ack.retry++;
inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
return;
}
@@ -3934,7 +4101,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);