aboutsummaryrefslogtreecommitdiffstats
path: root/net/core/sock_reuseport.c
diff options
context:
space:
mode:
authorKuniyuki Iwashima <kuniyu@amazon.co.jp>2021-06-12 21:32:22 +0900
committerDaniel Borkmann <daniel@iogearbox.net>2021-06-15 18:01:06 +0200
commitd5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e (patch)
tree6e40ab51a4b090c62469756265f16199e4d9ced7 /net/core/sock_reuseport.c
parentbpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT. (diff)
downloadlinux-dev-d5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e.tar.xz
linux-dev-d5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e.zip
bpf: Support socket migration by eBPF.
This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to check if the attached eBPF program is capable of migrating sockets. When the eBPF program is attached, we run it for socket migration if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or net.ipv4.tcp_migrate_req is enabled. Currently, the expected_attach_type is not enforced for the BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the earlier idea in the commit aac3fc320d94 ("bpf: Post-hooks for sys_bind") to fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type(). Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to select a new listener based on the child socket. migrating_sk varies depending on if it is migrating a request in the accept queue or during 3WHS. - accept_queue : sock (ESTABLISHED/SYN_RECV) - 3WHS : request_sock (NEW_SYN_RECV) In the eBPF program, we can select a new listener by BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning SK_DROP. This feature is useful when listeners have different settings at the socket API level or when we want to free resources as soon as possible. - SK_PASS with selected_sk, select it as a new listener - SK_PASS with selected_sk NULL, fallbacks to the random selection - SK_DROP, cancel the migration. There is a noteworthy point. We select a listening socket in three places, but we do not have struct skb at closing a listener or retransmitting a SYN+ACK. On the other hand, some helper functions do not expect skb is NULL (e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer() in BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb temporarily before running the eBPF program. Suggested-by: Martin KaFai Lau <kafai@fb.com> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Eric Dumazet <edumazet@google.com> Acked-by: Martin KaFai Lau <kafai@fb.com> Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp
Diffstat (limited to 'net/core/sock_reuseport.c')
-rw-r--r--net/core/sock_reuseport.c34
1 files changed, 30 insertions, 4 deletions
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b239f8cd9d39..de5ee3ae86d5 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -377,13 +377,17 @@ void reuseport_stop_listen_sock(struct sock *sk)
{
if (sk->sk_protocol == IPPROTO_TCP) {
struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+ prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
- if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
+ (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
/* Migration capable, move sk from the listening section
* to the closed section.
*/
@@ -488,7 +492,7 @@ struct sock *reuseport_select_sock(struct sock *sk,
goto select_by_hash;
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
- sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
else
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
@@ -519,6 +523,8 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
{
struct sock_reuseport *reuse;
struct sock *nsk = NULL;
+ bool allocated = false;
+ struct bpf_prog *prog;
u16 socks;
u32 hash;
@@ -536,10 +542,30 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
smp_rmb();
hash = migrating_sk->sk_hash;
- if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+ prog = rcu_dereference(reuse->prog);
+ if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+ goto select_by_hash;
+ goto out;
+ }
+
+ if (!skb) {
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+ allocated = true;
+ }
+
+ nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+ if (allocated)
+ kfree_skb(skb);
+
+select_by_hash:
+ if (!nsk)
nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
- if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+ if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
nsk = NULL;
out: