diff options
author | 2020-09-10 20:53:01 -0700 | |
---|---|---|
committer | 2020-09-10 20:53:15 -0700 | |
commit | 2bab48c5bef00d103085da71a01da27ec7200c08 (patch) | |
tree | ac322ae7e14ca53346ebf620db5f5dd438ff0aa2 | |
parent | tools: bpftool: Automate generation for "SEE ALSO" sections in man pages (diff) | |
parent | tcp: Simplify tcp_set_congestion_control() load=false case (diff) | |
download | wireguard-linux-2bab48c5bef00d103085da71a01da27ec7200c08.tar.xz wireguard-linux-2bab48c5bef00d103085da71a01da27ec7200c08.zip |
Merge branch 'improve-bpf-tcp-cc-init'
Neal Cardwell says:
====================
This patch series reorganizes TCP congestion control initialization so that if
EBPF code called by tcp_init_transfer() sets the congestion control algorithm
by calling setsockopt(TCP_CONGESTION) then the TCP stack initializes the
congestion control module immediately, instead of having tcp_init_transfer()
later initialize the congestion control module.
This increases flexibility for the EBPF code that runs at connection
establishment time, and simplifies the code.
This has the following benefits:
(1) This allows CC module customizations made by the EBPF called in
tcp_init_transfer() to persist, and not be wiped out by a later
call to tcp_init_congestion_control() in tcp_init_transfer().
(2) Does not flip the order of EBPF and CC init, to avoid causing bugs
for existing code upstream that depends on the current order.
(3) Does not cause 2 initializations for for CC in the case where the
EBPF called in tcp_init_transfer() wants to set the CC to a new CC
algorithm.
(4) Allows follow-on simplifications to the code in net/core/filter.c
and net/ipv4/tcp_cong.c, which currently both have some complexity
to special-case CC initialization to avoid double CC
initialization if EBPF sets the CC.
changes in v2:
o rebase onto bpf-next
o add another follow-on simplification suggested by Martin KaFai Lau:
"tcp: simplify tcp_set_congestion_control() load=false case"
changes in v3:
o no change in commits
o resent patch series from @gmail.com, since mail from ncardwell@google.com
stopped being accepted at netdev@vger.kernel.org mid-way through processing
the v2 patch series (between patches 2 and 3), confusing patchwork about
which patches belonged to the v2 patch series
====================
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r-- | include/net/inet_connection_sock.h | 3 | ||||
-rw-r--r-- | include/net/tcp.h | 2 | ||||
-rw-r--r-- | net/core/filter.c | 18 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 27 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 4 |
6 files changed, 19 insertions, 38 deletions
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c738abeb3265..dc763ca9413c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -96,7 +96,8 @@ struct inet_connection_sock { void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:6, + __u8 icsk_ca_state:5, + icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; diff --git a/include/net/tcp.h b/include/net/tcp.h index e85d564446c6..f857146c17a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1104,7 +1104,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin); + bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 47eef9a0be6a..d266c6941967 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4313,10 +4313,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#define SOCKOPT_CC_REINIT (1 << 0) - static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen, u32 flags) + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ -4449,13 +4447,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, - reinit, true); + ret = tcp_set_congestion_control(sk, name, false, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -4615,9 +4611,7 @@ err_clear: BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { @@ -4651,11 +4645,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - flags |= SOCKOPT_CC_REINIT; - return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57a568875539..e58ab9db73ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2698,6 +2698,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -3049,7 +3050,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9c..db47ac24d057 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, @@ -340,7 +341,7 @@ out: * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -361,28 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4337841faeff..0e5ac0d33fd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5894,8 +5894,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; + icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); - tcp_init_congestion_control(sk); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } |