aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2022-07-22 16:55:43 -0700
committerJakub Kicinski <kuba@kernel.org>2022-07-22 16:55:44 -0700
commitb3fce974d4239bd46ae81bba07b59f255eb979d7 (patch)
tree9856a94f13b97f74e831664899b1ab9d1e0c7b69 /net
parentMerge branch 'io_uring-zerocopy-send' of git://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux (diff)
parentbpf: Simplify bpf_prog_pack_[size|mask] (diff)
downloadlinux-dev-b3fce974d4239bd46ae81bba07b59f255eb979d7.tar.xz
linux-dev-b3fce974d4239bd46ae81bba07b59f255eb979d7.zip
Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says: ==================== bpf-next 2022-07-22 We've added 73 non-merge commits during the last 12 day(s) which contain a total of 88 files changed, 3458 insertions(+), 860 deletions(-). The main changes are: 1) Implement BPF trampoline for arm64 JIT, from Xu Kuohai. 2) Add ksyscall/kretsyscall section support to libbpf to simplify tracing kernel syscalls through kprobe mechanism, from Andrii Nakryiko. 3) Allow for livepatch (KLP) and BPF trampolines to attach to the same kernel function, from Song Liu & Jiri Olsa. 4) Add new kfunc infrastructure for netfilter's CT e.g. to insert and change entries, from Kumar Kartikeya Dwivedi & Lorenzo Bianconi. 5) Add a ksym BPF iterator to allow for more flexible and efficient interactions with kernel symbols, from Alan Maguire. 6) Bug fixes in libbpf e.g. for uprobe binary path resolution, from Dan Carpenter. 7) Fix BPF subprog function names in stack traces, from Alexei Starovoitov. 8) libbpf support for writing custom perf event readers, from Jon Doron. 9) Switch to use SPDX tag for BPF helper man page, from Alejandro Colomar. 10) Fix xsk send-only sockets when in busy poll mode, from Maciej Fijalkowski. 11) Reparent BPF maps and their charging on memcg offlining, from Roman Gushchin. 12) Multiple follow-up fixes around BPF lsm cgroup infra, from Stanislav Fomichev. 13) Use bootstrap version of bpftool where possible to speed up builds, from Pu Lehui. 14) Cleanup BPF verifier's check_func_arg() handling, from Joanne Koong. 15) Make non-prealloced BPF map allocations low priority to play better with memcg limits, from Yafang Shao. 16) Fix BPF test runner to reject zero-length data for skbs, from Zhengchao Shao. 17) Various smaller cleanups and improvements all over the place. * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (73 commits) bpf: Simplify bpf_prog_pack_[size|mask] bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch) bpf, x64: Allow to use caller address from stack ftrace: Allow IPMODIFY and DIRECT ops on the same function ftrace: Add modify_ftrace_direct_multi_nolock bpf/selftests: Fix couldn't retrieve pinned program in xdp veth test bpf: Fix build error in case of !CONFIG_DEBUG_INFO_BTF selftests/bpf: Fix test_verifier failed test in unprivileged mode selftests/bpf: Add negative tests for new nf_conntrack kfuncs selftests/bpf: Add tests for new nf_conntrack kfuncs selftests/bpf: Add verifier tests for trusted kfunc args net: netfilter: Add kfuncs to set and change CT status net: netfilter: Add kfuncs to set and change CT timeout net: netfilter: Add kfuncs to allocate and insert CT net: netfilter: Deduplicate code in bpf_{xdp,skb}_ct_lookup bpf: Add documentation for kfuncs bpf: Add support for forcing kfunc args to be trusted bpf: Switch to new kfunc flags infrastructure tools/resolve_btfids: Add support for 8-byte BTF sets bpf: Introduce 8-byte BTF set ... ==================== Link: https://lore.kernel.org/r/20220722221218.29943-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/bpf/test_run.c78
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/filter.c4
-rw-r--r--net/core/skmsg.c4
-rw-r--r--net/ipv4/bpf_tcp_ca.c18
-rw-r--r--net/ipv4/tcp_bbr.c24
-rw-r--r--net/ipv4/tcp_cubic.c20
-rw-r--r--net/ipv4/tcp_dctcp.c20
-rw-r--r--net/netfilter/nf_conntrack_bpf.c365
-rw-r--r--net/netfilter/nf_conntrack_core.c62
-rw-r--r--net/netfilter/nf_conntrack_netlink.c54
-rw-r--r--net/xdp/xsk.c5
12 files changed, 422 insertions, 233 deletions
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ca96acbc50a..cbc9cd5058cb 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -691,52 +691,35 @@ noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
{
}
+noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
+{
+}
+
__diag_pop();
ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
-BTF_SET_START(test_sk_check_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test1)
-BTF_ID(func, bpf_kfunc_call_test2)
-BTF_ID(func, bpf_kfunc_call_test3)
-BTF_ID(func, bpf_kfunc_call_test_acquire)
-BTF_ID(func, bpf_kfunc_call_memb_acquire)
-BTF_ID(func, bpf_kfunc_call_test_release)
-BTF_ID(func, bpf_kfunc_call_memb_release)
-BTF_ID(func, bpf_kfunc_call_memb1_release)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_ID(func, bpf_kfunc_call_test_pass_ctx)
-BTF_ID(func, bpf_kfunc_call_test_pass1)
-BTF_ID(func, bpf_kfunc_call_test_pass2)
-BTF_ID(func, bpf_kfunc_call_test_fail1)
-BTF_ID(func, bpf_kfunc_call_test_fail2)
-BTF_ID(func, bpf_kfunc_call_test_fail3)
-BTF_ID(func, bpf_kfunc_call_test_mem_len_pass1)
-BTF_ID(func, bpf_kfunc_call_test_mem_len_fail1)
-BTF_ID(func, bpf_kfunc_call_test_mem_len_fail2)
-BTF_SET_END(test_sk_check_kfunc_ids)
-
-BTF_SET_START(test_sk_acquire_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_acquire)
-BTF_ID(func, bpf_kfunc_call_memb_acquire)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_SET_END(test_sk_acquire_kfunc_ids)
-
-BTF_SET_START(test_sk_release_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_release)
-BTF_ID(func, bpf_kfunc_call_memb_release)
-BTF_ID(func, bpf_kfunc_call_memb1_release)
-BTF_SET_END(test_sk_release_kfunc_ids)
-
-BTF_SET_START(test_sk_ret_null_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_acquire)
-BTF_ID(func, bpf_kfunc_call_memb_acquire)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_SET_END(test_sk_ret_null_kfunc_ids)
-
-BTF_SET_START(test_sk_kptr_acquire_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_SET_END(test_sk_kptr_acquire_kfunc_ids)
+BTF_SET8_START(test_sk_check_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test3)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_kptr_get, KF_ACQUIRE | KF_RET_NULL | KF_KPTR_GET)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
+BTF_SET8_END(test_sk_check_kfunc_ids)
static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
u32 size, u32 headroom, u32 tailroom)
@@ -955,6 +938,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
{
struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+ if (!skb->len)
+ return -EINVAL;
+
if (!__skb)
return 0;
@@ -1617,12 +1603,8 @@ out:
}
static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &test_sk_check_kfunc_ids,
- .acquire_set = &test_sk_acquire_kfunc_ids,
- .release_set = &test_sk_release_kfunc_ids,
- .ret_null_set = &test_sk_ret_null_kfunc_ids,
- .kptr_acquire_set = &test_sk_kptr_acquire_kfunc_ids
+ .owner = THIS_MODULE,
+ .set = &test_sk_check_kfunc_ids,
};
BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids)
diff --git a/net/core/dev.c b/net/core/dev.c
index d588fd0a54ce..716df64fcfa5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4168,6 +4168,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
bool again = false;
skb_reset_mac_header(skb);
+ skb_assert_len(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
diff --git a/net/core/filter.c b/net/core/filter.c
index a0c61094aeac..57c5e4c4efd2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -237,7 +237,7 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u16 tmp, *ptr;
+ __be16 tmp, *ptr;
const int len = sizeof(tmp);
if (offset >= 0) {
@@ -264,7 +264,7 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u32 tmp, *ptr;
+ __be32 tmp, *ptr;
const int len = sizeof(tmp);
if (likely(offset >= 0)) {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 266d3b7b7d0b..81627892bdd4 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -462,7 +462,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
if (copied == len)
break;
- } while (i != msg_rx->sg.end);
+ } while (!sg_is_last(sge));
if (unlikely(peek)) {
msg_rx = sk_psock_next_msg(psock, msg_rx);
@@ -472,7 +472,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
}
msg_rx->sg.start = i;
- if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ if (!sge->length && sg_is_last(sge)) {
msg_rx = sk_psock_dequeue_msg(psock);
kfree_sk_msg(msg_rx);
}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 7a181631b995..85a9e500c42d 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -197,17 +197,17 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
}
}
-BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
-BTF_ID(func, tcp_reno_ssthresh)
-BTF_ID(func, tcp_reno_cong_avoid)
-BTF_ID(func, tcp_reno_undo_cwnd)
-BTF_ID(func, tcp_slow_start)
-BTF_ID(func, tcp_cong_avoid_ai)
-BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
+BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids)
+BTF_ID_FLAGS(func, tcp_reno_ssthresh)
+BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
+BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
+BTF_ID_FLAGS(func, tcp_slow_start)
+BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
+BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids)
static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &bpf_tcp_ca_check_kfunc_ids,
+ .owner = THIS_MODULE,
+ .set = &bpf_tcp_ca_check_kfunc_ids,
};
static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 075e744bfb48..54eec33c6e1c 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1154,24 +1154,24 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.set_state = bbr_set_state,
};
-BTF_SET_START(tcp_bbr_check_kfunc_ids)
+BTF_SET8_START(tcp_bbr_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
-BTF_ID(func, bbr_init)
-BTF_ID(func, bbr_main)
-BTF_ID(func, bbr_sndbuf_expand)
-BTF_ID(func, bbr_undo_cwnd)
-BTF_ID(func, bbr_cwnd_event)
-BTF_ID(func, bbr_ssthresh)
-BTF_ID(func, bbr_min_tso_segs)
-BTF_ID(func, bbr_set_state)
+BTF_ID_FLAGS(func, bbr_init)
+BTF_ID_FLAGS(func, bbr_main)
+BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_undo_cwnd)
+BTF_ID_FLAGS(func, bbr_cwnd_event)
+BTF_ID_FLAGS(func, bbr_ssthresh)
+BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_set_state)
#endif
#endif
-BTF_SET_END(tcp_bbr_check_kfunc_ids)
+BTF_SET8_END(tcp_bbr_check_kfunc_ids)
static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &tcp_bbr_check_kfunc_ids,
+ .owner = THIS_MODULE,
+ .set = &tcp_bbr_check_kfunc_ids,
};
static int __init bbr_register(void)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 68178e7280ce..768c10c1f649 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -485,22 +485,22 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.name = "cubic",
};
-BTF_SET_START(tcp_cubic_check_kfunc_ids)
+BTF_SET8_START(tcp_cubic_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
-BTF_ID(func, cubictcp_init)
-BTF_ID(func, cubictcp_recalc_ssthresh)
-BTF_ID(func, cubictcp_cong_avoid)
-BTF_ID(func, cubictcp_state)
-BTF_ID(func, cubictcp_cwnd_event)
-BTF_ID(func, cubictcp_acked)
+BTF_ID_FLAGS(func, cubictcp_init)
+BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
+BTF_ID_FLAGS(func, cubictcp_cong_avoid)
+BTF_ID_FLAGS(func, cubictcp_state)
+BTF_ID_FLAGS(func, cubictcp_cwnd_event)
+BTF_ID_FLAGS(func, cubictcp_acked)
#endif
#endif
-BTF_SET_END(tcp_cubic_check_kfunc_ids)
+BTF_SET8_END(tcp_cubic_check_kfunc_ids)
static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &tcp_cubic_check_kfunc_ids,
+ .owner = THIS_MODULE,
+ .set = &tcp_cubic_check_kfunc_ids,
};
static int __init cubictcp_register(void)
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ab034a4e9324..2a6c0dd665a4 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -239,22 +239,22 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.name = "dctcp-reno",
};
-BTF_SET_START(tcp_dctcp_check_kfunc_ids)
+BTF_SET8_START(tcp_dctcp_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
-BTF_ID(func, dctcp_init)
-BTF_ID(func, dctcp_update_alpha)
-BTF_ID(func, dctcp_cwnd_event)
-BTF_ID(func, dctcp_ssthresh)
-BTF_ID(func, dctcp_cwnd_undo)
-BTF_ID(func, dctcp_state)
+BTF_ID_FLAGS(func, dctcp_init)
+BTF_ID_FLAGS(func, dctcp_update_alpha)
+BTF_ID_FLAGS(func, dctcp_cwnd_event)
+BTF_ID_FLAGS(func, dctcp_ssthresh)
+BTF_ID_FLAGS(func, dctcp_cwnd_undo)
+BTF_ID_FLAGS(func, dctcp_state)
#endif
#endif
-BTF_SET_END(tcp_dctcp_check_kfunc_ids)
+BTF_SET8_END(tcp_dctcp_check_kfunc_ids)
static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &tcp_dctcp_check_kfunc_ids,
+ .owner = THIS_MODULE,
+ .set = &tcp_dctcp_check_kfunc_ids,
};
static int __init dctcp_register(void)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index bc4d5cd63a94..1cd87b28c9b0 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -55,57 +55,131 @@ enum {
NF_BPF_CT_OPTS_SZ = 12,
};
-static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
- struct bpf_sock_tuple *bpf_tuple,
- u32 tuple_len, u8 protonum,
- s32 netns_id, u8 *dir)
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, u8 protonum, u8 dir,
+ struct nf_conntrack_tuple *tuple)
{
- struct nf_conntrack_tuple_hash *hash;
- struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
+ union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+ union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+ union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+ : &tuple->src.u;
+ union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+ : (void *)&tuple->dst.u;
if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
- return ERR_PTR(-EPROTO);
- if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
- return ERR_PTR(-EINVAL);
+ return -EPROTO;
+
+ memset(tuple, 0, sizeof(*tuple));
- memset(&tuple, 0, sizeof(tuple));
switch (tuple_len) {
case sizeof(bpf_tuple->ipv4):
- tuple.src.l3num = AF_INET;
- tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
- tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
- tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
- tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+ tuple->src.l3num = AF_INET;
+ src->ip = bpf_tuple->ipv4.saddr;
+ sport->tcp.port = bpf_tuple->ipv4.sport;
+ dst->ip = bpf_tuple->ipv4.daddr;
+ dport->tcp.port = bpf_tuple->ipv4.dport;
break;
case sizeof(bpf_tuple->ipv6):
- tuple.src.l3num = AF_INET6;
- memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
- tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
- memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
- tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+ tuple->src.l3num = AF_INET6;
+ memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+ sport->tcp.port = bpf_tuple->ipv6.sport;
+ memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+ dport->tcp.port = bpf_tuple->ipv6.dport;
break;
default:
- return ERR_PTR(-EAFNOSUPPORT);
+ return -EAFNOSUPPORT;
+ }
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = dir;
+
+ return 0;
+}
+
+static struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
+ u32 timeout)
+{
+ struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts_len != NF_BPF_CT_OPTS_SZ)
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &otuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_REPLY, &rtuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
}
- tuple.dst.protonum = protonum;
+ ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
+ GFP_ATOMIC);
+ if (IS_ERR(ct))
+ goto out;
+
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ __nf_ct_set_timeout(ct, timeout * HZ);
+ ct->status |= IPS_CONFIRMED;
+
+out:
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return ct;
+}
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+ struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts,
+ u32 opts_len)
+{
+ struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts_len != NF_BPF_CT_OPTS_SZ)
+ return ERR_PTR(-EINVAL);
+ if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
+ return ERR_PTR(-EPROTO);
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &tuple);
+ if (err < 0)
+ return ERR_PTR(err);
- if (netns_id >= 0) {
- net = get_net_ns_by_id(net, netns_id);
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
if (unlikely(!net))
return ERR_PTR(-ENONET);
}
hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
- if (netns_id >= 0)
+ if (opts->netns_id >= 0)
put_net(net);
if (!hash)
return ERR_PTR(-ENOENT);
ct = nf_ct_tuplehash_to_ctrack(hash);
- if (dir)
- *dir = NF_CT_DIRECTION(hash);
+ opts->dir = NF_CT_DIRECTION(hash);
return ct;
}
@@ -114,6 +188,43 @@ __diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in nf_conntrack BTF");
+struct nf_conn___init {
+ struct nf_conn ct;
+};
+
+/* bpf_xdp_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct nf_conn *nfct;
+
+ nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
+ opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
* reference to it
*
@@ -138,25 +249,50 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
struct net *caller_net;
struct nf_conn *nfct;
- BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
-
- if (!opts)
- return NULL;
- if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
- opts__sz != NF_BPF_CT_OPTS_SZ) {
- opts->error = -EINVAL;
- return NULL;
- }
caller_net = dev_net(ctx->rxq->dev);
- nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
- opts->netns_id, &opts->dir);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
if (IS_ERR(nfct)) {
- opts->error = PTR_ERR(nfct);
+ if (opts)
+ opts->error = PTR_ERR(nfct);
return NULL;
}
return nfct;
}
+/* bpf_skb_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct nf_conn *nfct;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
* reference to it
*
@@ -181,20 +317,31 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
struct net *caller_net;
struct nf_conn *nfct;
- BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
-
- if (!opts)
- return NULL;
- if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
- opts__sz != NF_BPF_CT_OPTS_SZ) {
- opts->error = -EINVAL;
- return NULL;
- }
caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
- nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
- opts->netns_id, &opts->dir);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
if (IS_ERR(nfct)) {
- opts->error = PTR_ERR(nfct);
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_insert_entry - Add the provided entry into a CT map
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID.
+ *
+ * @nfct - Pointer to referenced nf_conn___init object, obtained
+ * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ */
+struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+{
+ struct nf_conn *nfct = (struct nf_conn *)nfct_i;
+ int err;
+
+ err = nf_conntrack_hash_check_insert(nfct);
+ if (err < 0) {
+ nf_conntrack_free(nfct);
return NULL;
}
return nfct;
@@ -217,50 +364,90 @@ void bpf_ct_release(struct nf_conn *nfct)
nf_ct_put(nfct);
}
+/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
+ *
+ * Sets the default timeout of newly allocated nf_conn before insertion.
+ * This helper must be invoked for refcounted pointer to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @timeout - Timeout in msecs.
+ */
+void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+{
+ __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
+ *
+ * Change timeout associated of the inserted or looked up nf_conn.
+ * This helper must be invoked for refcounted pointer to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
+ * @timeout - New timeout in msecs.
+ */
+int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+{
+ return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_set_status - Set status field of allocated nf_conn
+ *
+ * Set the status field of the newly allocated nf_conn before insertion.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @status - New status value.
+ */
+int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+{
+ return nf_ct_change_status_common((struct nf_conn *)nfct, status);
+}
+
+/* bpf_ct_change_status - Change status of inserted nf_conn
+ *
+ * Change the status field of the provided connection tracking entry.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @status - New status value.
+ */
+int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+{
+ return nf_ct_change_status_common(nfct, status);
+}
+
__diag_pop()
-BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
-BTF_ID(func, bpf_xdp_ct_lookup)
-BTF_ID(func, bpf_ct_release)
-BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
-
-BTF_SET_START(nf_ct_tc_check_kfunc_ids)
-BTF_ID(func, bpf_skb_ct_lookup)
-BTF_ID(func, bpf_ct_release)
-BTF_SET_END(nf_ct_tc_check_kfunc_ids)
-
-BTF_SET_START(nf_ct_acquire_kfunc_ids)
-BTF_ID(func, bpf_xdp_ct_lookup)
-BTF_ID(func, bpf_skb_ct_lookup)
-BTF_SET_END(nf_ct_acquire_kfunc_ids)
-
-BTF_SET_START(nf_ct_release_kfunc_ids)
-BTF_ID(func, bpf_ct_release)
-BTF_SET_END(nf_ct_release_kfunc_ids)
-
-/* Both sets are identical */
-#define nf_ct_ret_null_kfunc_ids nf_ct_acquire_kfunc_ids
-
-static const struct btf_kfunc_id_set nf_conntrack_xdp_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &nf_ct_xdp_check_kfunc_ids,
- .acquire_set = &nf_ct_acquire_kfunc_ids,
- .release_set = &nf_ct_release_kfunc_ids,
- .ret_null_set = &nf_ct_ret_null_kfunc_ids,
-};
+BTF_SET8_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_SET8_END(nf_ct_kfunc_set)
-static const struct btf_kfunc_id_set nf_conntrack_tc_kfunc_set = {
- .owner = THIS_MODULE,
- .check_set = &nf_ct_tc_check_kfunc_ids,
- .acquire_set = &nf_ct_acquire_kfunc_ids,
- .release_set = &nf_ct_release_kfunc_ids,
- .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ct_kfunc_set,
};
int register_nf_conntrack_bpf(void)
{
int ret;
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_xdp_kfunc_set);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_tc_kfunc_set);
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8c97d062b1ae..71c2f4f95d36 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2806,3 +2806,65 @@ err_expect:
free_percpu(net->ct.stat);
return ret;
}
+
+#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) || \
+ IS_ENABLED(CONFIG_NF_CT_NETLINK))
+
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
+{
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return -EPERM;
+
+ __nf_ct_set_timeout(ct, timeout);
+
+ if (test_bit(IPS_DYING_BIT, &ct->status))
+ return -ETIME;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
+
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
+{
+ unsigned int bit;
+
+ /* Ignore these unchangable bits */
+ on &= ~IPS_UNCHANGEABLE_MASK;
+ off &= ~IPS_UNCHANGEABLE_MASK;
+
+ for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+ if (on & (1 << bit))
+ set_bit(bit, &ct->status);
+ else if (off & (1 << bit))
+ clear_bit(bit, &ct->status);
+ }
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_status);
+
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
+{
+ unsigned long d;
+
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EBUSY;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ __nf_ct_change_status(ct, status, 0);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
+
+#endif
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index f8dd4ed8dc60..04169b54f2a2 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1891,45 +1891,10 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static void
-__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
- unsigned long off)
-{
- unsigned int bit;
-
- /* Ignore these unchangable bits */
- on &= ~IPS_UNCHANGEABLE_MASK;
- off &= ~IPS_UNCHANGEABLE_MASK;
-
- for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
- if (on & (1 << bit))
- set_bit(bit, &ct->status);
- else if (off & (1 << bit))
- clear_bit(bit, &ct->status);
- }
-}
-
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
- unsigned long d;
- unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
- d = ct->status ^ status;
-
- if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
- /* unchangeable */
- return -EBUSY;
-
- if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
- /* SEEN_REPLY bit can only be set */
- return -EBUSY;
-
- if (d & IPS_ASSURED && !(status & IPS_ASSURED))
- /* ASSURED bit can only be set */
- return -EBUSY;
-
- __ctnetlink_change_status(ct, status, 0);
- return 0;
+ return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
}
static int
@@ -2024,16 +1989,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
static int ctnetlink_change_timeout(struct nf_conn *ct,
const struct nlattr * const cda[])
{
- u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
-
- if (test_bit(IPS_DYING_BIT, &ct->status))
- return -ETIME;
-
- return 0;
+ return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
@@ -2293,9 +2249,7 @@ ctnetlink_create_conntrack(struct net *net,
goto err1;
timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = (u32)timeout + nfct_time_stamp;
+ __nf_ct_set_timeout(ct, timeout);
rcu_read_lock();
if (cda[CTA_HELP]) {
@@ -2837,7 +2791,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
* unchangeable bits but do not error out. Also user programs
* are allowed to clear the bits that they are allowed to change.
*/
- __ctnetlink_change_status(ct, status, ~status);
+ __nf_ct_change_status(ct, status, ~status);
return 0;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 09002387987e..5b4ce6ba1bc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -639,8 +639,11 @@ static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len
if (unlikely(need_wait))
return -EOPNOTSUPP;
- if (sk_can_busy_loop(sk))
+ if (sk_can_busy_loop(sk)) {
+ if (xs->zc)
+ __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+ }
if (xs->zc && xsk_no_wakeup(sk))
return 0;