aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/bpf.h1
-rw-r--r--include/linux/bpf_types.h1
-rw-r--r--include/linux/skbuff.h20
-rw-r--r--include/net/net_namespace.h3
-rw-r--r--include/net/sch_generic.h12
-rw-r--r--include/uapi/linux/bpf.h26
-rw-r--r--kernel/bpf/arraymap.c25
-rw-r--r--kernel/bpf/sockmap.c6
-rw-r--r--kernel/bpf/syscall.c8
-rw-r--r--kernel/bpf/verifier.c48
-rw-r--r--net/core/filter.c139
-rw-r--r--net/core/flow_dissector.c140
-rw-r--r--samples/bpf/bpf_load.c1
-rw-r--r--samples/bpf/sampleip_user.c1
-rw-r--r--samples/bpf/sockex2_kern.c11
-rw-r--r--samples/bpf/sockex3_kern.c8
-rw-r--r--samples/bpf/sockex3_user.c4
-rw-r--r--samples/bpf/test_current_task_under_cgroup_user.c1
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-net.rst139
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool.rst6
-rw-r--r--tools/bpf/bpftool/bash-completion/bpftool17
-rw-r--r--tools/bpf/bpftool/main.c3
-rw-r--r--tools/bpf/bpftool/main.h8
-rw-r--r--tools/bpf/bpftool/map.c12
-rw-r--r--tools/bpf/bpftool/net.c272
-rw-r--r--tools/bpf/bpftool/netlink_dumper.c174
-rw-r--r--tools/bpf/bpftool/netlink_dumper.h95
-rw-r--r--tools/bpf/bpftool/prog.c1
-rw-r--r--tools/include/uapi/linux/bpf.h26
-rw-r--r--tools/lib/bpf/Build2
-rw-r--r--tools/lib/bpf/bpf.c129
-rw-r--r--tools/lib/bpf/libbpf.c4
-rw-r--r--tools/lib/bpf/libbpf.h16
-rw-r--r--tools/lib/bpf/libbpf_errno.c1
-rw-r--r--tools/lib/bpf/netlink.c331
-rw-r--r--tools/lib/bpf/nlattr.c33
-rw-r--r--tools/lib/bpf/nlattr.h38
-rw-r--r--tools/testing/selftests/bpf/.gitignore6
-rw-r--r--tools/testing/selftests/bpf/Makefile8
-rw-r--r--tools/testing/selftests/bpf/bpf_flow.c373
-rw-r--r--tools/testing/selftests/bpf/config1
-rw-r--r--tools/testing/selftests/bpf/flow_dissector_load.c140
-rw-r--r--tools/testing/selftests/bpf/test_flow_dissector.c782
-rwxr-xr-xtools/testing/selftests/bpf/test_flow_dissector.sh115
-rw-r--r--tools/testing/selftests/bpf/test_progs.c20
-rwxr-xr-xtools/testing/selftests/bpf/with_addr.sh54
-rwxr-xr-xtools/testing/selftests/bpf/with_tunnels.sh36
47 files changed, 3066 insertions, 231 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 523481a3471b..988a00797bcd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -212,6 +212,7 @@ enum bpf_reg_type {
PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */
+ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
};
/* The information passed from prog-specific *_is_valid_access
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c090e7c0..c9bd6fb765b0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -16,6 +16,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
#endif
#ifdef CONFIG_BPF_EVENTS
BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 86f337e9a81d..87e29710373f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -243,6 +243,8 @@ struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
+struct bpf_prog;
+union bpf_attr;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack {
@@ -1192,6 +1194,24 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count);
+#ifdef CONFIG_NET
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog);
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
+#else
+static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 9b5fdc50519a..99d4148e0f90 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -43,6 +43,7 @@ struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
+struct bpf_prog;
#define NETDEV_HASHBITS 8
@@ -145,6 +146,8 @@ struct net {
#endif
struct net_generic __rcu *gen;
+ struct bpf_prog __rcu *flow_dissector_prog;
+
/* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7a6b71ee5433..4d736427a4cb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@ struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
+struct bpf_flow_keys;
typedef int tc_setup_cb_t(enum tc_setup_type type,
void *type_data, void *cb_priv);
@@ -321,9 +322,14 @@ struct tcf_proto {
};
struct qdisc_skb_cb {
- unsigned int pkt_len;
- u16 slave_dev_queue_mapping;
- u16 tc_classid;
+ union {
+ struct {
+ unsigned int pkt_len;
+ u16 slave_dev_queue_mapping;
+ u16 tc_classid;
+ };
+ struct bpf_flow_keys *flow_keys;
+ };
#define QDISC_CB_PRIV_LEN 20
unsigned char data[QDISC_CB_PRIV_LEN];
};
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4eba27..aa5ccd2385ed 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT,
+ BPF_PROG_TYPE_FLOW_DISSECTOR,
};
enum bpf_attach_type {
@@ -172,6 +173,7 @@ enum bpf_attach_type {
BPF_CGROUP_UDP4_SENDMSG,
BPF_CGROUP_UDP6_SENDMSG,
BPF_LIRC_MODE2,
+ BPF_FLOW_DISSECTOR,
__MAX_BPF_ATTACH_TYPE
};
@@ -2333,6 +2335,7 @@ struct __sk_buff {
/* ... here. */
__u32 data_meta;
+ struct bpf_flow_keys *flow_keys;
};
struct bpf_tunnel_key {
@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type {
BPF_FD_TYPE_URETPROBE, /* filename + offset */
};
+struct bpf_flow_keys {
+ __u16 nhoff;
+ __u16 thoff;
+ __u16 addr_proto; /* ETH_P_* of valid addrs */
+ __u8 is_frag;
+ __u8 is_first_frag;
+ __u8 is_encap;
+ __u8 ip_proto;
+ __be16 n_proto;
+ __be16 sport;
+ __be16 dport;
+ union {
+ struct {
+ __be32 ipv4_src;
+ __be32 ipv4_dst;
+ };
+ struct {
+ __u32 ipv6_src[4]; /* in6_addr; network order */
+ __u32 ipv6_dst[4]; /* in6_addr; network order */
+ };
+ };
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f9d24121be99..dded84cbe814 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -553,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map)
fd_array_map_delete_elem(map, &i);
}
+static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void **elem, *ptr;
+ u32 prog_id;
+
+ rcu_read_lock();
+
+ elem = array_map_lookup_elem(map, key);
+ if (elem) {
+ ptr = READ_ONCE(*elem);
+ if (ptr) {
+ seq_printf(m, "%u: ", *(u32 *)key);
+ prog_id = prog_fd_array_sys_lookup_elem(ptr);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ &prog_id, m);
+ seq_puts(m, "\n");
+ }
+ }
+
+ rcu_read_unlock();
+}
+
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -564,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = bpf_fd_array_map_clear,
- .map_check_btf = map_check_no_btf,
+ .map_seq_show_elem = prog_array_map_seq_show_elem,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0a0f2ec75370..d37a1a0a6e1e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -612,8 +612,7 @@ static int free_sg(struct sock *sk, int start,
if (i == MAX_SKB_FRAGS)
i = 0;
}
- if (md->skb)
- consume_skb(md->skb);
+ consume_skb(md->skb);
return free;
}
@@ -995,8 +994,7 @@ bytes_ready:
if (!sg->length && md->sg_start == md->sg_end) {
list_del(&md->list);
- if (md->skb)
- consume_skb(md->skb);
+ consume_skb(md->skb);
kfree(md);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3c9636f03bb2..b3c2d09bcf7a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_LIRC_MODE2:
ptype = BPF_PROG_TYPE_LIRC_MODE2;
break;
+ case BPF_FLOW_DISSECTOR:
+ ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
+ break;
default:
return -EINVAL;
}
@@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+ break;
default:
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
}
@@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
case BPF_LIRC_MODE2:
return lirc_prog_detach(attr);
+ case BPF_FLOW_DISSECTOR:
+ return skb_flow_dissector_bpf_prog_detach(attr);
default:
return -EINVAL;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8cd4f5306c3c..e986518d7bc3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -261,6 +261,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
};
static char slot_type_char[] = {
@@ -570,7 +571,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg);
*/
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- reg->id = 0;
+ /* Clear id, off, and union(map_ptr, range) */
+ memset(((u8 *)reg) + sizeof(reg->type), 0,
+ offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
reg->var_off = tnum_const(imm);
reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm;
@@ -589,7 +592,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
static void __mark_reg_const_zero(struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
- reg->off = 0;
reg->type = SCALAR_VALUE;
}
@@ -700,9 +702,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
/* Mark a register as having a completely unknown (scalar) value. */
static void __mark_reg_unknown(struct bpf_reg_state *reg)
{
+ /*
+ * Clear type, id, off, and union(map_ptr, range) and
+ * padding between 'type' and union
+ */
+ memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
- reg->id = 0;
- reg->off = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
__mark_reg_unbounded(reg);
@@ -961,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
+ case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP:
return true;
default:
@@ -1234,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
if (meta)
return meta->pkt_access;
@@ -1317,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
return -EACCES;
}
+static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
+ int size)
+{
+ if (size < 0 || off < 0 ||
+ (u64)off + size > sizeof(struct bpf_flow_keys)) {
+ verbose(env, "invalid access to flow keys off=%d size=%d\n",
+ off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
static bool __is_pointer_value(bool allow_ptr_leaks,
const struct bpf_reg_state *reg)
{
@@ -1418,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
* right in front, treat it the very same way.
*/
return check_pkt_ptr_alignment(env, reg, off, size, strict);
+ case PTR_TO_FLOW_KEYS:
+ pointer_desc = "flow keys ";
+ break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -1640,9 +1662,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
else
mark_reg_known_zero(env, regs,
value_regno);
- regs[value_regno].id = 0;
- regs[value_regno].off = 0;
- regs[value_regno].range = 0;
regs[value_regno].type = reg_type;
}
@@ -1691,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_packet_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_FLOW_KEYS) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into flow keys\n",
+ value_regno);
+ return -EACCES;
+ }
+
+ err = check_flow_keys_access(env, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]);
@@ -1838,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_FLOW_KEYS:
+ return check_flow_keys_access(env, reg->off, access_size);
case PTR_TO_MAP_VALUE:
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed);
@@ -2495,7 +2527,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_CTX:
case CONST_PTR_TO_MAP:
case PTR_TO_PACKET_END:
+ case PTR_TO_FLOW_KEYS:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
diff --git a/net/core/filter.c b/net/core/filter.c
index 0b40f95da21b..72db8afb7cb6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3176,6 +3176,32 @@ static int __bpf_tx_xdp(struct net_device *dev,
return 0;
}
+static noinline int
+xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
+{
+ struct net_device *fwd;
+ u32 index = ri->ifindex;
+ int err;
+
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ ri->ifindex = 0;
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
+ if (unlikely(err))
+ goto err;
+
+ _trace_xdp_redirect(dev, xdp_prog, index);
+ return 0;
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ return err;
+}
+
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_map *map,
struct xdp_buff *xdp,
@@ -3188,7 +3214,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_dtab_netdev *dst = fwd;
err = dev_map_enqueue(dst, xdp, dev_rx);
- if (err)
+ if (unlikely(err))
return err;
__dev_map_insert_ctx(map, index);
break;
@@ -3197,7 +3223,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
- if (err)
+ if (unlikely(err))
return err;
__cpu_map_insert_ctx(map, index);
break;
@@ -3238,7 +3264,7 @@ void xdp_do_flush_map(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);
-static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
+static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP:
@@ -3270,9 +3296,9 @@ void bpf_clear_redirect_map(struct bpf_map *map)
}
static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog, struct bpf_map *map)
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ struct bpf_redirect_info *ri)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
u32 index = ri->ifindex;
void *fwd = NULL;
int err;
@@ -3281,11 +3307,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
WRITE_ONCE(ri->map, NULL);
fwd = __xdp_map_lookup_elem(map, index);
- if (!fwd) {
+ if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
- if (ri->map_to_flush && ri->map_to_flush != map)
+ if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
xdp_do_flush_map();
err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
@@ -3305,29 +3331,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
- struct net_device *fwd;
- u32 index = ri->ifindex;
- int err;
- if (map)
- return xdp_do_redirect_map(dev, xdp, xdp_prog, map);
+ if (likely(map))
+ return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->ifindex = 0;
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
-
- err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
- if (unlikely(err))
- goto err;
-
- _trace_xdp_redirect(dev, xdp_prog, index);
- return 0;
-err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
- return err;
+ return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
@@ -4044,14 +4052,15 @@ static const struct bpf_func_proto bpf_setsockopt_proto = {
BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
- struct inet_connection_sock *icsk;
struct sock *sk = bpf_sock->sk;
- struct tcp_sock *tp;
if (!sk_fullsock(sk))
goto err_clear;
#ifdef CONFIG_INET
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
+ struct inet_connection_sock *icsk;
+ struct tcp_sock *tp;
+
switch (optname) {
case TCP_CONGESTION:
icsk = inet_csk(sk);
@@ -5116,6 +5125,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -5233,6 +5253,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (size != size_default)
return false;
break;
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
+ if (size != sizeof(struct bpf_flow_keys *))
+ return false;
+ break;
default:
/* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
@@ -5258,6 +5282,7 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -5283,6 +5308,7 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
return false;
}
@@ -5493,6 +5519,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -5694,6 +5721,7 @@ static bool sk_skb_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
return false;
}
@@ -5753,6 +5781,39 @@ static bool sk_msg_is_valid_access(int off, int size,
return true;
}
+static bool flow_dissector_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
+ info->reg_type = PTR_TO_FLOW_KEYS;
+ break;
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -6047,6 +6108,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct sock_common,
skc_num, 2, target_size));
break;
+
+ case offsetof(struct __sk_buff, flow_keys):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, flow_keys);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, flow_keys);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
}
return insn - insn_buf;
@@ -7010,6 +7080,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
const struct bpf_prog_ops sk_msg_prog_ops = {
};
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+ .get_func_proto = flow_dissector_func_proto,
+ .is_valid_access = flow_dissector_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1e745db44cc3..676f3ad629f9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -25,6 +25,9 @@
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+
+static DEFINE_MUTEX(flow_dissector_mutex);
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog *attached;
+ struct net *net;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&flow_dissector_mutex);
+ attached = rcu_dereference_protected(net->flow_dissector_prog,
+ lockdep_is_held(&flow_dissector_mutex));
+ if (attached) {
+ /* Only one BPF program can be attached at a time */
+ mutex_unlock(&flow_dissector_mutex);
+ return -EEXIST;
+ }
+ rcu_assign_pointer(net->flow_dissector_prog, prog);
+ mutex_unlock(&flow_dissector_mutex);
+ return 0;
+}
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct bpf_prog *attached;
+ struct net *net;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&flow_dissector_mutex);
+ attached = rcu_dereference_protected(net->flow_dissector_prog,
+ lockdep_is_held(&flow_dissector_mutex));
+ if (!attached) {
+ mutex_unlock(&flow_dissector_mutex);
+ return -ENOENT;
+ }
+ bpf_prog_put(attached);
+ RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+ mutex_unlock(&flow_dissector_mutex);
+ return 0;
+}
/**
* skb_flow_get_be16 - extract be16 entity
* @skb: sk_buff to extract from
@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
}
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_ports *key_ports;
+
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+ key_control->thoff = flow_keys->thoff;
+ if (flow_keys->is_frag)
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+ if (flow_keys->is_first_frag)
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flow_keys->is_encap)
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+ key_basic->n_proto = flow_keys->n_proto;
+ key_basic->ip_proto = flow_keys->ip_proto;
+
+ if (flow_keys->addr_proto == ETH_P_IP &&
+ dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
+ key_addrs->v4addrs.src = flow_keys->ipv4_src;
+ key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
+ memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->src = flow_keys->sport;
+ key_ports->dst = flow_keys->dport;
+ }
+}
+
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret;
enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ struct bpf_prog *attached = NULL;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -658,6 +754,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+ if (skb) {
+ if (skb->dev)
+ attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);
+ else if (skb->sk)
+ attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog);
+ else
+ WARN_ON_ONCE(1);
+ }
+ if (attached) {
+ /* Note that even though the const qualifier is discarded
+ * throughout the execution of the BPF program, all changes(the
+ * control block) are reverted after the BPF program returns.
+ * Therefore, __skb_flow_dissect does not alter the skb.
+ */
+ struct bpf_flow_keys flow_keys = {};
+ struct bpf_skb_data_end cb_saved;
+ struct bpf_skb_data_end *cb;
+ u32 result;
+
+ cb = (struct bpf_skb_data_end *)skb->cb;
+
+ /* Save Control Block */
+ memcpy(&cb_saved, cb, sizeof(cb_saved));
+ memset(cb, 0, sizeof(cb_saved));
+
+ /* Pass parameters to the BPF program */
+ cb->qdisc_cb.flow_keys = &flow_keys;
+ flow_keys.nhoff = nhoff;
+
+ bpf_compute_data_pointers((struct sk_buff *)skb);
+ result = BPF_PROG_RUN(attached, skb);
+
+ /* Restore state */
+ memcpy(cb, &cb_saved, sizeof(cb_saved));
+
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ key_control->thoff = min_t(u16, key_control->thoff, skb->len);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 904e775d1a44..e6d7e0fe155b 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -16,7 +16,6 @@
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
-#include <sys/types.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 60c2b73d1b4d..216c7ecbbbe9 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -9,7 +9,6 @@
*/
#include <stdio.h>
#include <stdlib.h>
-#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
index f58acfc92556..f2f9dbc021b0 100644
--- a/samples/bpf/sockex2_kern.c
+++ b/samples/bpf/sockex2_kern.c
@@ -14,7 +14,7 @@ struct vlan_hdr {
__be16 h_vlan_encapsulated_proto;
};
-struct bpf_flow_keys {
+struct flow_key_record {
__be32 src;
__be32 dst;
union {
@@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
}
static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
- struct bpf_flow_keys *flow)
+ struct flow_key_record *flow)
{
__u64 verlen;
@@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto
}
static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
- struct bpf_flow_keys *flow)
+ struct flow_key_record *flow)
{
*ip_proto = load_byte(skb,
nhoff + offsetof(struct ipv6hdr, nexthdr));
@@ -96,7 +96,8 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_pro
return nhoff;
}
-static inline bool flow_dissector(struct __sk_buff *skb, struct bpf_flow_keys *flow)
+static inline bool flow_dissector(struct __sk_buff *skb,
+ struct flow_key_record *flow)
{
__u64 nhoff = ETH_HLEN;
__u64 ip_proto;
@@ -198,7 +199,7 @@ struct bpf_map_def SEC("maps") hash_map = {
SEC("socket2")
int bpf_prog2(struct __sk_buff *skb)
{
- struct bpf_flow_keys flow = {};
+ struct flow_key_record flow = {};
struct pair *value;
u32 key;
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
index 95907f8d2b17..c527b57d3ec8 100644
--- a/samples/bpf/sockex3_kern.c
+++ b/samples/bpf/sockex3_kern.c
@@ -61,7 +61,7 @@ struct vlan_hdr {
__be16 h_vlan_encapsulated_proto;
};
-struct bpf_flow_keys {
+struct flow_key_record {
__be32 src;
__be32 dst;
union {
@@ -88,7 +88,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
}
struct globals {
- struct bpf_flow_keys flow;
+ struct flow_key_record flow;
};
struct bpf_map_def SEC("maps") percpu_map = {
@@ -114,14 +114,14 @@ struct pair {
struct bpf_map_def SEC("maps") hash_map = {
.type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct bpf_flow_keys),
+ .key_size = sizeof(struct flow_key_record),
.value_size = sizeof(struct pair),
.max_entries = 1024,
};
static void update_stats(struct __sk_buff *skb, struct globals *g)
{
- struct bpf_flow_keys key = g->flow;
+ struct flow_key_record key = g->flow;
struct pair *value;
value = bpf_map_lookup_elem(&hash_map, &key);
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 5ba3ae9d180b..9d02e0404719 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -13,7 +13,7 @@
#define PARSE_IP_PROG_FD (prog_fd[0])
#define PROG_ARRAY_FD (map_fd[0])
-struct bpf_flow_keys {
+struct flow_key_record {
__be32 src;
__be32 dst;
union {
@@ -64,7 +64,7 @@ int main(int argc, char **argv)
(void) f;
for (i = 0; i < 5; i++) {
- struct bpf_flow_keys key = {}, next_key;
+ struct flow_key_record key = {}, next_key;
struct pair value;
sleep(1);
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 4be4874ca2bc..2259f997a26c 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -11,7 +11,6 @@
#include <unistd.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
-#include <linux/bpf.h>
#include "cgroup_helpers.h"
#define CGROUP_PATH "/my-cgroup"
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
new file mode 100644
index 000000000000..408ec30d8872
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -0,0 +1,139 @@
+================
+bpftool-net
+================
+-------------------------------------------------------------------------------
+tool for inspection of netdev/tc related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+ **bpftool** [*OPTIONS*] **net** *COMMAND*
+
+ *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+ *COMMANDS* :=
+ { **show** | **list** } [ **dev** name ] | **help**
+
+NET COMMANDS
+============
+
+| **bpftool** **net { show | list } [ dev name ]**
+| **bpftool** **net help**
+
+DESCRIPTION
+===========
+ **bpftool net { show | list } [ dev name ]**
+ List bpf program attachments in the kernel networking subsystem.
+
+ Currently, only device driver xdp attachments and tc filter
+ classification/action attachments are implemented, i.e., for
+ program types **BPF_PROG_TYPE_SCHED_CLS**,
+ **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**.
+ For programs attached to a particular cgroup, e.g.,
+ **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
+ **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ users can use **bpftool cgroup** to dump cgroup attachments.
+ For sk_{filter, skb, msg, reuseport} and lwt/seg6
+ bpf programs, users should consult other tools, e.g., iproute2.
+
+ The current output will start with all xdp program attachments, followed by
+ all tc class/qdisc bpf program attachments. Both xdp programs and
+ tc programs are ordered based on ifindex number. If multiple bpf
+ programs attached to the same networking device through **tc filter**,
+ the order will be first all bpf programs attached to tc classes, then
+ all bpf programs attached to non clsact qdiscs, and finally all
+ bpf programs attached to root and clsact qdisc.
+
+ **bpftool net help**
+ Print short help message.
+
+OPTIONS
+=======
+ -h, --help
+ Print short generic help message (similar to **bpftool help**).
+
+ -v, --version
+ Print version number (similar to **bpftool version**).
+
+ -j, --json
+ Generate JSON output. For commands that cannot produce JSON, this
+ option has no effect.
+
+ -p, --pretty
+ Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool net**
+
+::
+
+ xdp:
+ eth0(2) driver id 198
+
+ tc:
+ eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act []
+ eth0(2) clsact/ingress fbflow_icmp id 130246 act []
+ eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726
+ eth0(2) clsact/egress cls_fg_dscp id 108619 act []
+ eth0(2) clsact/egress fbflow_egress id 130245
+
+|
+| **# bpftool -jp net**
+
+::
+
+ [{
+ "xdp": [{
+ "devname": "eth0",
+ "ifindex": 2,
+ "mode": "driver",
+ "id": 198
+ }
+ ],
+ "tc": [{
+ "devname": "eth0",
+ "ifindex": 2,
+ "kind": "htb",
+ "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
+ "id": 111727,
+ "act": []
+ },{
+ "devname": "eth0",
+ "ifindex": 2,
+ "kind": "clsact/ingress",
+ "name": "fbflow_icmp",
+ "id": 130246,
+ "act": []
+ },{
+ "devname": "eth0",
+ "ifindex": 2,
+ "kind": "clsact/egress",
+ "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
+ "id": 111726,
+ },{
+ "devname": "eth0",
+ "ifindex": 2,
+ "kind": "clsact/egress",
+ "name": "cls_fg_dscp",
+ "id": 108619,
+ "act": []
+ },{
+ "devname": "eth0",
+ "ifindex": 2,
+ "kind": "clsact/egress",
+ "name": "fbflow_egress",
+ "id": 130245,
+ }
+ ]
+ }
+ ]
+
+
+SEE ALSO
+========
+ **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index b6f5d560460d..8dda77daeda9 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
**bpftool** **version**
- *OBJECT* := { **map** | **program** | **cgroup** | **perf** }
+ *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** }
*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -32,6 +32,8 @@ SYNOPSIS
*PERF-COMMANDS* := { **show** | **list** | **help** }
+ *NET-COMMANDS* := { **show** | **list** | **help** }
+
DESCRIPTION
===========
*bpftool* allows for inspection and simple modification of BPF objects
@@ -58,4 +60,4 @@ OPTIONS
SEE ALSO
========
**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
- **bpftool-perf**\ (8)
+ **bpftool-perf**\ (8), **bpftool-net**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 598066c40191..df1060b852c1 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -494,10 +494,10 @@ _bpftool()
_filedir
return 0
;;
- tree)
- _filedir
- return 0
- ;;
+ tree)
+ _filedir
+ return 0
+ ;;
attach|detach)
local ATTACH_TYPES='ingress egress sock_create sock_ops \
device bind4 bind6 post_bind4 post_bind6 connect4 \
@@ -552,6 +552,15 @@ _bpftool()
;;
esac
;;
+ net)
+ case $command in
+ *)
+ [[ $prev == $object ]] && \
+ COMPREPLY=( $( compgen -W 'help \
+ show list' -- "$cur" ) )
+ ;;
+ esac
+ ;;
esac
} &&
complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index d15a62be6cf0..79dc3f193547 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -85,7 +85,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n"
" %s version\n"
"\n"
- " OBJECT := { prog | map | cgroup | perf }\n"
+ " OBJECT := { prog | map | cgroup | perf | net }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@@ -215,6 +215,7 @@ static const struct cmd cmds[] = {
{ "map", do_map },
{ "cgroup", do_cgroup },
{ "perf", do_perf },
+ { "net", do_net },
{ "version", do_version },
{ 0 }
};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 238e734d75b3..40492cdc4e53 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -136,6 +136,7 @@ int do_map(int argc, char **arg);
int do_event_pipe(int argc, char **argv);
int do_cgroup(int argc, char **arg);
int do_perf(int argc, char **arg);
+int do_net(int argc, char **arg);
int prog_parse_fd(int *argc, char ***argv);
int map_parse_fd(int *argc, char ***argv);
@@ -165,4 +166,11 @@ struct btf_dumper {
*/
int btf_dumper_type(const struct btf_dumper *d, __u32 type_id,
const void *data);
+
+struct nlattr;
+struct ifinfomsg;
+struct tcmsg;
+int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb);
+int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind,
+ const char *devname, int ifindex);
#endif
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 9c55077ca5dd..e22fbe8b975f 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -71,6 +71,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_XSKMAP] = "xskmap",
[BPF_MAP_TYPE_SOCKHASH] = "sockhash",
[BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage",
+ [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
};
static bool map_is_per_cpu(__u32 type)
@@ -673,12 +674,6 @@ static int do_dump(int argc, char **argv)
if (fd < 0)
return -1;
- if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) {
- p_err("Dumping maps of maps and program maps not supported");
- close(fd);
- return -1;
- }
-
key = malloc(info.key_size);
value = alloc_value(&info);
if (!key || !value) {
@@ -732,7 +727,9 @@ static int do_dump(int argc, char **argv)
} else {
print_entry_plain(&info, key, value);
}
- } else {
+ num_elems++;
+ } else if (!map_is_map_of_maps(info.type) &&
+ !map_is_map_of_progs(info.type)) {
if (json_output) {
jsonw_name(json_wtr, "key");
print_hex_data_json(key, info.key_size);
@@ -749,7 +746,6 @@ static int do_dump(int argc, char **argv)
}
prev_key = key;
- num_elems++;
}
if (json_output)
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
new file mode 100644
index 000000000000..ed205ee57655
--- /dev/null
+++ b/tools/bpf/bpftool/net.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libbpf.h>
+#include <net/if.h>
+#include <linux/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
+#include <sys/socket.h>
+
+#include <bpf.h>
+#include <nlattr.h>
+#include "main.h"
+#include "netlink_dumper.h"
+
+struct ip_devname_ifindex {
+ char devname[64];
+ int ifindex;
+};
+
+struct bpf_netdev_t {
+ struct ip_devname_ifindex *devices;
+ int used_len;
+ int array_len;
+ int filter_idx;
+};
+
+struct tc_kind_handle {
+ char kind[64];
+ int handle;
+};
+
+struct bpf_tcinfo_t {
+ struct tc_kind_handle *handle_array;
+ int used_len;
+ int array_len;
+ bool is_qdisc;
+};
+
+struct bpf_filter_t {
+ const char *kind;
+ const char *devname;
+ int ifindex;
+};
+
+static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+ struct bpf_netdev_t *netinfo = cookie;
+ struct ifinfomsg *ifinfo = msg;
+
+ if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index)
+ return 0;
+
+ if (netinfo->used_len == netinfo->array_len) {
+ netinfo->devices = realloc(netinfo->devices,
+ (netinfo->array_len + 16) *
+ sizeof(struct ip_devname_ifindex));
+ if (!netinfo->devices)
+ return -ENOMEM;
+
+ netinfo->array_len += 16;
+ }
+ netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index;
+ snprintf(netinfo->devices[netinfo->used_len].devname,
+ sizeof(netinfo->devices[netinfo->used_len].devname),
+ "%s",
+ tb[IFLA_IFNAME] ? nla_getattr_str(tb[IFLA_IFNAME]) : "");
+ netinfo->used_len++;
+
+ return do_xdp_dump(ifinfo, tb);
+}
+
+static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+ struct bpf_tcinfo_t *tcinfo = cookie;
+ struct tcmsg *info = msg;
+
+ if (tcinfo->is_qdisc) {
+ /* skip clsact qdisc */
+ if (tb[TCA_KIND] &&
+ strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0)
+ return 0;
+ if (info->tcm_handle == 0)
+ return 0;
+ }
+
+ if (tcinfo->used_len == tcinfo->array_len) {
+ tcinfo->handle_array = realloc(tcinfo->handle_array,
+ (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle));
+ if (!tcinfo->handle_array)
+ return -ENOMEM;
+
+ tcinfo->array_len += 16;
+ }
+ tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle;
+ snprintf(tcinfo->handle_array[tcinfo->used_len].kind,
+ sizeof(tcinfo->handle_array[tcinfo->used_len].kind),
+ "%s",
+ tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown");
+ tcinfo->used_len++;
+
+ return 0;
+}
+
+static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+ const struct bpf_filter_t *filter_info = cookie;
+
+ return do_filter_dump((struct tcmsg *)msg, tb, filter_info->kind,
+ filter_info->devname, filter_info->ifindex);
+}
+
+static int show_dev_tc_bpf(int sock, unsigned int nl_pid,
+ struct ip_devname_ifindex *dev)
+{
+ struct bpf_filter_t filter_info;
+ struct bpf_tcinfo_t tcinfo;
+ int i, handle, ret = 0;
+
+ tcinfo.handle_array = NULL;
+ tcinfo.used_len = 0;
+ tcinfo.array_len = 0;
+
+ tcinfo.is_qdisc = false;
+ ret = nl_get_class(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
+ &tcinfo);
+ if (ret)
+ goto out;
+
+ tcinfo.is_qdisc = true;
+ ret = nl_get_qdisc(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
+ &tcinfo);
+ if (ret)
+ goto out;
+
+ filter_info.devname = dev->devname;
+ filter_info.ifindex = dev->ifindex;
+ for (i = 0; i < tcinfo.used_len; i++) {
+ filter_info.kind = tcinfo.handle_array[i].kind;
+ ret = nl_get_filter(sock, nl_pid, dev->ifindex,
+ tcinfo.handle_array[i].handle,
+ dump_filter_nlmsg,
+ &filter_info);
+ if (ret)
+ goto out;
+ }
+
+ /* root, ingress and egress handle */
+ handle = TC_H_ROOT;
+ filter_info.kind = "root";
+ ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+ dump_filter_nlmsg, &filter_info);
+ if (ret)
+ goto out;
+
+ handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
+ filter_info.kind = "clsact/ingress";
+ ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+ dump_filter_nlmsg, &filter_info);
+ if (ret)
+ goto out;
+
+ handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
+ filter_info.kind = "clsact/egress";
+ ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+ dump_filter_nlmsg, &filter_info);
+ if (ret)
+ goto out;
+
+out:
+ free(tcinfo.handle_array);
+ return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+ int i, sock, ret, filter_idx = -1;
+ struct bpf_netdev_t dev_array;
+ unsigned int nl_pid;
+ char err_buf[256];
+
+ if (argc == 2) {
+ if (strcmp(argv[0], "dev") != 0)
+ usage();
+ filter_idx = if_nametoindex(argv[1]);
+ if (filter_idx == 0) {
+ fprintf(stderr, "invalid dev name %s\n", argv[1]);
+ return -1;
+ }
+ } else if (argc != 0) {
+ usage();
+ }
+
+ sock = bpf_netlink_open(&nl_pid);
+ if (sock < 0) {
+ fprintf(stderr, "failed to open netlink sock\n");
+ return -1;
+ }
+
+ dev_array.devices = NULL;
+ dev_array.used_len = 0;
+ dev_array.array_len = 0;
+ dev_array.filter_idx = filter_idx;
+
+ if (json_output)
+ jsonw_start_array(json_wtr);
+ NET_START_OBJECT;
+ NET_START_ARRAY("xdp", "%s:\n");
+ ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
+ NET_END_ARRAY("\n");
+
+ if (!ret) {
+ NET_START_ARRAY("tc", "%s:\n");
+ for (i = 0; i < dev_array.used_len; i++) {
+ ret = show_dev_tc_bpf(sock, nl_pid,
+ &dev_array.devices[i]);
+ if (ret)
+ break;
+ }
+ NET_END_ARRAY("\n");
+ }
+ NET_END_OBJECT;
+ if (json_output)
+ jsonw_end_array(json_wtr);
+
+ if (ret) {
+ if (json_output)
+ jsonw_null(json_wtr);
+ libbpf_strerror(ret, err_buf, sizeof(err_buf));
+ fprintf(stderr, "Error: %s\n", err_buf);
+ }
+ free(dev_array.devices);
+ close(sock);
+ return ret;
+}
+
+static int do_help(int argc, char **argv)
+{
+ if (json_output) {
+ jsonw_null(json_wtr);
+ return 0;
+ }
+
+ fprintf(stderr,
+ "Usage: %s %s { show | list } [dev <devname>]\n"
+ " %s %s help\n"
+ "Note: Only xdp and tc attachments are supported now.\n"
+ " For progs attached to cgroups, use \"bpftool cgroup\"\n"
+ " to dump program attachments. For program types\n"
+ " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n"
+ " consult iproute2.\n",
+ bin_name, argv[-2], bin_name, argv[-2]);
+
+ return 0;
+}
+
+static const struct cmd cmds[] = {
+ { "show", do_show },
+ { "list", do_show },
+ { "help", do_help },
+ { 0 }
+};
+
+int do_net(int argc, char **argv)
+{
+ return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c
new file mode 100644
index 000000000000..6f5e9cc6836c
--- /dev/null
+++ b/tools/bpf/bpftool/netlink_dumper.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#include <stdlib.h>
+#include <string.h>
+#include <libbpf.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
+
+#include <nlattr.h>
+#include "main.h"
+#include "netlink_dumper.h"
+
+static void xdp_dump_prog_id(struct nlattr **tb, int attr,
+ const char *mode,
+ bool new_json_object)
+{
+ if (!tb[attr])
+ return;
+
+ if (new_json_object)
+ NET_START_OBJECT
+ NET_DUMP_STR("mode", " %s", mode);
+ NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[attr]))
+ if (new_json_object)
+ NET_END_OBJECT
+}
+
+static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
+ const char *name)
+{
+ struct nlattr *tb[IFLA_XDP_MAX + 1];
+ unsigned char mode;
+
+ if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0)
+ return -1;
+
+ if (!tb[IFLA_XDP_ATTACHED])
+ return 0;
+
+ mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]);
+ if (mode == XDP_ATTACHED_NONE)
+ return 0;
+
+ NET_START_OBJECT;
+ if (name)
+ NET_DUMP_STR("devname", "%s", name);
+ NET_DUMP_UINT("ifindex", "(%d)", ifindex);
+
+ if (mode == XDP_ATTACHED_MULTI) {
+ if (json_output) {
+ jsonw_name(json_wtr, "multi_attachments");
+ jsonw_start_array(json_wtr);
+ }
+ xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true);
+ xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true);
+ xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true);
+ if (json_output)
+ jsonw_end_array(json_wtr);
+ } else if (mode == XDP_ATTACHED_DRV) {
+ xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false);
+ } else if (mode == XDP_ATTACHED_SKB) {
+ xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false);
+ } else if (mode == XDP_ATTACHED_HW) {
+ xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false);
+ }
+
+ NET_END_OBJECT_FINAL;
+ return 0;
+}
+
+int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb)
+{
+ if (!tb[IFLA_XDP])
+ return 0;
+
+ return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index,
+ nla_getattr_str(tb[IFLA_IFNAME]));
+}
+
+static int do_bpf_dump_one_act(struct nlattr *attr)
+{
+ struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+
+ if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ if (!tb[TCA_ACT_BPF_PARMS])
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ NET_START_OBJECT_NESTED2;
+ if (tb[TCA_ACT_BPF_NAME])
+ NET_DUMP_STR("name", "%s",
+ nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
+ if (tb[TCA_ACT_BPF_ID])
+ NET_DUMP_UINT("id", " id %u",
+ nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
+ NET_END_OBJECT_NESTED;
+ return 0;
+}
+
+static int do_dump_one_act(struct nlattr *attr)
+{
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+
+ if (!attr)
+ return 0;
+
+ if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ if (tb[TCA_ACT_KIND] && strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") == 0)
+ return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]);
+
+ return 0;
+}
+
+static int do_bpf_act_dump(struct nlattr *attr)
+{
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ int act, ret;
+
+ if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ NET_START_ARRAY("act", " %s [");
+ for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) {
+ ret = do_dump_one_act(tb[act]);
+ if (ret)
+ break;
+ }
+ NET_END_ARRAY("] ");
+
+ return ret;
+}
+
+static int do_bpf_filter_dump(struct nlattr *attr)
+{
+ struct nlattr *tb[TCA_BPF_MAX + 1];
+ int ret;
+
+ if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ if (tb[TCA_BPF_NAME])
+ NET_DUMP_STR("name", " %s", nla_getattr_str(tb[TCA_BPF_NAME]));
+ if (tb[TCA_BPF_ID])
+ NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[TCA_BPF_ID]));
+ if (tb[TCA_BPF_ACT]) {
+ ret = do_bpf_act_dump(tb[TCA_BPF_ACT]);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind,
+ const char *devname, int ifindex)
+{
+ int ret = 0;
+
+ if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) {
+ NET_START_OBJECT;
+ if (devname[0] != '\0')
+ NET_DUMP_STR("devname", "%s", devname);
+ NET_DUMP_UINT("ifindex", "(%u)", ifindex);
+ NET_DUMP_STR("kind", " %s", kind);
+ ret = do_bpf_filter_dump(tb[TCA_OPTIONS]);
+ NET_END_OBJECT_FINAL;
+ }
+
+ return ret;
+}
diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h
new file mode 100644
index 000000000000..0788cfbbed0e
--- /dev/null
+++ b/tools/bpf/bpftool/netlink_dumper.h
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#ifndef _NETLINK_DUMPER_H_
+#define _NETLINK_DUMPER_H_
+
+#define NET_START_OBJECT \
+{ \
+ if (json_output) \
+ jsonw_start_object(json_wtr); \
+}
+
+#define NET_START_OBJECT_NESTED(name) \
+{ \
+ if (json_output) { \
+ jsonw_name(json_wtr, name); \
+ jsonw_start_object(json_wtr); \
+ } else { \
+ fprintf(stderr, "%s {", name); \
+ } \
+}
+
+#define NET_START_OBJECT_NESTED2 \
+{ \
+ if (json_output) \
+ jsonw_start_object(json_wtr); \
+ else \
+ fprintf(stderr, "{"); \
+}
+
+#define NET_END_OBJECT_NESTED \
+{ \
+ if (json_output) \
+ jsonw_end_object(json_wtr); \
+ else \
+ fprintf(stderr, "}"); \
+}
+
+#define NET_END_OBJECT \
+{ \
+ if (json_output) \
+ jsonw_end_object(json_wtr); \
+}
+
+#define NET_END_OBJECT_FINAL \
+{ \
+ if (json_output) \
+ jsonw_end_object(json_wtr); \
+ else \
+ fprintf(stderr, "\n"); \
+}
+
+#define NET_START_ARRAY(name, fmt_str) \
+{ \
+ if (json_output) { \
+ jsonw_name(json_wtr, name); \
+ jsonw_start_array(json_wtr); \
+ } else { \
+ fprintf(stderr, fmt_str, name); \
+ } \
+}
+
+#define NET_END_ARRAY(endstr) \
+{ \
+ if (json_output) \
+ jsonw_end_array(json_wtr); \
+ else \
+ fprintf(stderr, "%s", endstr); \
+}
+
+#define NET_DUMP_UINT(name, fmt_str, val) \
+{ \
+ if (json_output) \
+ jsonw_uint_field(json_wtr, name, val); \
+ else \
+ fprintf(stderr, fmt_str, val); \
+}
+
+#define NET_DUMP_STR(name, fmt_str, str) \
+{ \
+ if (json_output) \
+ jsonw_string_field(json_wtr, name, str);\
+ else \
+ fprintf(stderr, fmt_str, str); \
+}
+
+#define NET_DUMP_STR_ONLY(str) \
+{ \
+ if (json_output) \
+ jsonw_string(json_wtr, str); \
+ else \
+ fprintf(stderr, "%s ", str); \
+}
+
+#endif
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index dce960d22106..b1cd3bc8db70 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
[BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2",
+ [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
};
static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 66917a4eba27..aa5ccd2385ed 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT,
+ BPF_PROG_TYPE_FLOW_DISSECTOR,
};
enum bpf_attach_type {
@@ -172,6 +173,7 @@ enum bpf_attach_type {
BPF_CGROUP_UDP4_SENDMSG,
BPF_CGROUP_UDP6_SENDMSG,
BPF_LIRC_MODE2,
+ BPF_FLOW_DISSECTOR,
__MAX_BPF_ATTACH_TYPE
};
@@ -2333,6 +2335,7 @@ struct __sk_buff {
/* ... here. */
__u32 data_meta;
+ struct bpf_flow_keys *flow_keys;
};
struct bpf_tunnel_key {
@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type {
BPF_FD_TYPE_URETPROBE, /* filename + offset */
};
+struct bpf_flow_keys {
+ __u16 nhoff;
+ __u16 thoff;
+ __u16 addr_proto; /* ETH_P_* of valid addrs */
+ __u8 is_frag;
+ __u8 is_first_frag;
+ __u8 is_encap;
+ __u8 ip_proto;
+ __be16 n_proto;
+ __be16 sport;
+ __be16 dport;
+ union {
+ struct {
+ __be32 ipv4_src;
+ __be32 ipv4_dst;
+ };
+ struct {
+ __u32 ipv6_src[4]; /* in6_addr; network order */
+ __u32 ipv6_dst[4]; /* in6_addr; network order */
+ };
+ };
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index 6eb9bacd1948..7bc31c905018 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 60aa4ca8b2c5..3878a26a2071 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -28,16 +28,8 @@
#include <linux/bpf.h>
#include "bpf.h"
#include "libbpf.h"
-#include "nlattr.h"
-#include <linux/rtnetlink.h>
-#include <linux/if_link.h>
-#include <sys/socket.h>
#include <errno.h>
-#ifndef SOL_NETLINK
-#define SOL_NETLINK 270
-#endif
-
/*
* When building perf, unistd.h is overridden. __NR_bpf is
* required to be defined explicitly.
@@ -499,127 +491,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
}
-int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
-{
- struct sockaddr_nl sa;
- int sock, seq = 0, len, ret = -1;
- char buf[4096];
- struct nlattr *nla, *nla_xdp;
- struct {
- struct nlmsghdr nh;
- struct ifinfomsg ifinfo;
- char attrbuf[64];
- } req;
- struct nlmsghdr *nh;
- struct nlmsgerr *err;
- socklen_t addrlen;
- int one = 1;
-
- memset(&sa, 0, sizeof(sa));
- sa.nl_family = AF_NETLINK;
-
- sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
- if (sock < 0) {
- return -errno;
- }
-
- if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
- &one, sizeof(one)) < 0) {
- fprintf(stderr, "Netlink error reporting not supported\n");
- }
-
- if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
- ret = -errno;
- goto cleanup;
- }
-
- addrlen = sizeof(sa);
- if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
- ret = -errno;
- goto cleanup;
- }
-
- if (addrlen != sizeof(sa)) {
- ret = -LIBBPF_ERRNO__INTERNAL;
- goto cleanup;
- }
-
- memset(&req, 0, sizeof(req));
- req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
- req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
- req.nh.nlmsg_type = RTM_SETLINK;
- req.nh.nlmsg_pid = 0;
- req.nh.nlmsg_seq = ++seq;
- req.ifinfo.ifi_family = AF_UNSPEC;
- req.ifinfo.ifi_index = ifindex;
-
- /* started nested attribute for XDP */
- nla = (struct nlattr *)(((char *)&req)
- + NLMSG_ALIGN(req.nh.nlmsg_len));
- nla->nla_type = NLA_F_NESTED | IFLA_XDP;
- nla->nla_len = NLA_HDRLEN;
-
- /* add XDP fd */
- nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
- nla_xdp->nla_type = IFLA_XDP_FD;
- nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
- memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
- nla->nla_len += nla_xdp->nla_len;
-
- /* if user passed in any flags, add those too */
- if (flags) {
- nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
- nla_xdp->nla_type = IFLA_XDP_FLAGS;
- nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
- memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
- nla->nla_len += nla_xdp->nla_len;
- }
-
- req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
-
- if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
- ret = -errno;
- goto cleanup;
- }
-
- len = recv(sock, buf, sizeof(buf), 0);
- if (len < 0) {
- ret = -errno;
- goto cleanup;
- }
-
- for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
- nh = NLMSG_NEXT(nh, len)) {
- if (nh->nlmsg_pid != sa.nl_pid) {
- ret = -LIBBPF_ERRNO__WRNGPID;
- goto cleanup;
- }
- if (nh->nlmsg_seq != seq) {
- ret = -LIBBPF_ERRNO__INVSEQ;
- goto cleanup;
- }
- switch (nh->nlmsg_type) {
- case NLMSG_ERROR:
- err = (struct nlmsgerr *)NLMSG_DATA(nh);
- if (!err->error)
- continue;
- ret = err->error;
- nla_dump_errormsg(nh);
- goto cleanup;
- case NLMSG_DONE:
- break;
- default:
- break;
- }
- }
-
- ret = 0;
-
-cleanup:
- close(sock);
- return ret;
-}
-
int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
bool do_log)
{
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index bdb94939fd60..4f8d43ae20d2 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_SK_REUSEPORT:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
return false;
case BPF_PROG_TYPE_UNSPEC:
case BPF_PROG_TYPE_KPROBE:
@@ -2121,6 +2122,7 @@ static const struct {
BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB),
BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG),
BPF_PROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2),
+ BPF_PROG_SEC("flow_dissector", BPF_PROG_TYPE_FLOW_DISSECTOR),
BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND),
BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND),
BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
@@ -2336,7 +2338,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
bpf_program__set_expected_attach_type(prog,
expected_attach_type);
- if (!bpf_program__is_function_storage(prog, obj) && !first_prog)
+ if (!first_prog)
first_prog = prog;
}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 96c55fac54c3..e3b00e23e181 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -46,6 +46,7 @@ enum libbpf_errno {
LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */
LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */
LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */
+ LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */
__LIBBPF_ERRNO__END,
};
@@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size,
unsigned long page_size,
void **buf, size_t *buf_len,
bpf_perf_event_print_t fn, void *priv);
+
+struct nlmsghdr;
+struct nlattr;
+typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
+typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
+ void *cookie);
+int bpf_netlink_open(unsigned int *nl_pid);
+int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
+ void *cookie);
+int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+ dump_nlmsg_t dump_class_nlmsg, void *cookie);
+int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+ dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
+int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+ dump_nlmsg_t dump_filter_nlmsg, void *cookie);
#endif
diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index d9ba851bd7f9..2464ade3b326 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -42,6 +42,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type",
[ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message",
[ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence",
+ [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing",
};
int libbpf_strerror(int err, char *buf, size_t size)
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
new file mode 100644
index 000000000000..fde1d7bf8199
--- /dev/null
+++ b/tools/lib/bpf/netlink.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: LGPL-2.1
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <errno.h>
+#include <time.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "nlattr.h"
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+int bpf_netlink_open(__u32 *nl_pid)
+{
+ struct sockaddr_nl sa;
+ socklen_t addrlen;
+ int one = 1, ret;
+ int sock;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0)
+ return -errno;
+
+ if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+ &one, sizeof(one)) < 0) {
+ fprintf(stderr, "Netlink error reporting not supported\n");
+ }
+
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+
+ addrlen = sizeof(sa);
+ if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+
+ if (addrlen != sizeof(sa)) {
+ ret = -LIBBPF_ERRNO__INTERNAL;
+ goto cleanup;
+ }
+
+ *nl_pid = sa.nl_pid;
+ return sock;
+
+cleanup:
+ close(sock);
+ return ret;
+}
+
+static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
+ __dump_nlmsg_t _fn, dump_nlmsg_t fn,
+ void *cookie)
+{
+ bool multipart = true;
+ struct nlmsgerr *err;
+ struct nlmsghdr *nh;
+ char buf[4096];
+ int len, ret;
+
+ while (multipart) {
+ multipart = false;
+ len = recv(sock, buf, sizeof(buf), 0);
+ if (len < 0) {
+ ret = -errno;
+ goto done;
+ }
+
+ if (len == 0)
+ break;
+
+ for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+ nh = NLMSG_NEXT(nh, len)) {
+ if (nh->nlmsg_pid != nl_pid) {
+ ret = -LIBBPF_ERRNO__WRNGPID;
+ goto done;
+ }
+ if (nh->nlmsg_seq != seq) {
+ ret = -LIBBPF_ERRNO__INVSEQ;
+ goto done;
+ }
+ if (nh->nlmsg_flags & NLM_F_MULTI)
+ multipart = true;
+ switch (nh->nlmsg_type) {
+ case NLMSG_ERROR:
+ err = (struct nlmsgerr *)NLMSG_DATA(nh);
+ if (!err->error)
+ continue;
+ ret = err->error;
+ nla_dump_errormsg(nh);
+ goto done;
+ case NLMSG_DONE:
+ return 0;
+ default:
+ break;
+ }
+ if (_fn) {
+ ret = _fn(nh, fn, cookie);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+ ret = 0;
+done:
+ return ret;
+}
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
+{
+ int sock, seq = 0, ret;
+ struct nlattr *nla, *nla_xdp;
+ struct {
+ struct nlmsghdr nh;
+ struct ifinfomsg ifinfo;
+ char attrbuf[64];
+ } req;
+ __u32 nl_pid;
+
+ sock = bpf_netlink_open(&nl_pid);
+ if (sock < 0)
+ return sock;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_type = RTM_SETLINK;
+ req.nh.nlmsg_pid = 0;
+ req.nh.nlmsg_seq = ++seq;
+ req.ifinfo.ifi_family = AF_UNSPEC;
+ req.ifinfo.ifi_index = ifindex;
+
+ /* started nested attribute for XDP */
+ nla = (struct nlattr *)(((char *)&req)
+ + NLMSG_ALIGN(req.nh.nlmsg_len));
+ nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+ nla->nla_len = NLA_HDRLEN;
+
+ /* add XDP fd */
+ nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+ nla_xdp->nla_type = IFLA_XDP_FD;
+ nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+ memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+ nla->nla_len += nla_xdp->nla_len;
+
+ /* if user passed in any flags, add those too */
+ if (flags) {
+ nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+ nla_xdp->nla_type = IFLA_XDP_FLAGS;
+ nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+ memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+ nla->nla_len += nla_xdp->nla_len;
+ }
+
+ req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+ ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);
+
+cleanup:
+ close(sock);
+ return ret;
+}
+
+static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg,
+ void *cookie)
+{
+ struct nlattr *tb[IFLA_MAX + 1], *attr;
+ struct ifinfomsg *ifi = NLMSG_DATA(nlh);
+ int len;
+
+ len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+ attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi)));
+ if (nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ return dump_link_nlmsg(cookie, ifi, tb);
+}
+
+int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
+ void *cookie)
+{
+ struct {
+ struct nlmsghdr nlh;
+ struct ifinfomsg ifm;
+ } req = {
+ .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+ .nlh.nlmsg_type = RTM_GETLINK,
+ .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+ .ifm.ifi_family = AF_PACKET,
+ };
+ int seq = time(NULL);
+
+ req.nlh.nlmsg_seq = seq;
+ if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+ return -errno;
+
+ return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg,
+ dump_link_nlmsg, cookie);
+}
+
+static int __dump_class_nlmsg(struct nlmsghdr *nlh,
+ dump_nlmsg_t dump_class_nlmsg, void *cookie)
+{
+ struct nlattr *tb[TCA_MAX + 1], *attr;
+ struct tcmsg *t = NLMSG_DATA(nlh);
+ int len;
+
+ len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+ attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+ if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ return dump_class_nlmsg(cookie, t, tb);
+}
+
+int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+ dump_nlmsg_t dump_class_nlmsg, void *cookie)
+{
+ struct {
+ struct nlmsghdr nlh;
+ struct tcmsg t;
+ } req = {
+ .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+ .nlh.nlmsg_type = RTM_GETTCLASS,
+ .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+ .t.tcm_family = AF_UNSPEC,
+ .t.tcm_ifindex = ifindex,
+ };
+ int seq = time(NULL);
+
+ req.nlh.nlmsg_seq = seq;
+ if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+ return -errno;
+
+ return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg,
+ dump_class_nlmsg, cookie);
+}
+
+static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
+ dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+{
+ struct nlattr *tb[TCA_MAX + 1], *attr;
+ struct tcmsg *t = NLMSG_DATA(nlh);
+ int len;
+
+ len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+ attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+ if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ return dump_qdisc_nlmsg(cookie, t, tb);
+}
+
+int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+ dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+{
+ struct {
+ struct nlmsghdr nlh;
+ struct tcmsg t;
+ } req = {
+ .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+ .nlh.nlmsg_type = RTM_GETQDISC,
+ .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+ .t.tcm_family = AF_UNSPEC,
+ .t.tcm_ifindex = ifindex,
+ };
+ int seq = time(NULL);
+
+ req.nlh.nlmsg_seq = seq;
+ if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+ return -errno;
+
+ return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg,
+ dump_qdisc_nlmsg, cookie);
+}
+
+static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
+ dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+{
+ struct nlattr *tb[TCA_MAX + 1], *attr;
+ struct tcmsg *t = NLMSG_DATA(nlh);
+ int len;
+
+ len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+ attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+ if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+ return -LIBBPF_ERRNO__NLPARSE;
+
+ return dump_filter_nlmsg(cookie, t, tb);
+}
+
+int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+ dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+{
+ struct {
+ struct nlmsghdr nlh;
+ struct tcmsg t;
+ } req = {
+ .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+ .nlh.nlmsg_type = RTM_GETTFILTER,
+ .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+ .t.tcm_family = AF_UNSPEC,
+ .t.tcm_ifindex = ifindex,
+ .t.tcm_parent = handle,
+ };
+ int seq = time(NULL);
+
+ req.nlh.nlmsg_seq = seq;
+ if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+ return -errno;
+
+ return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg,
+ dump_filter_nlmsg, cookie);
+}
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index 4719434278b2..49f514119bdb 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
[NLA_FLAG] = 0,
};
-static int nla_len(const struct nlattr *nla)
-{
- return nla->nla_len - NLA_HDRLEN;
-}
-
static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
int totlen = NLA_ALIGN(nla->nla_len);
@@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining)
nla->nla_len <= remaining;
}
-static void *nla_data(const struct nlattr *nla)
-{
- return (char *) nla + NLA_HDRLEN;
-}
-
static int nla_type(const struct nlattr *nla)
{
return nla->nla_type & NLA_TYPE_MASK;
@@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
* @see nla_validate
* @return 0 on success or a negative error code.
*/
-static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
- struct nla_policy *policy)
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+ struct nla_policy *policy)
{
struct nlattr *nla;
int rem, err;
@@ -146,6 +136,25 @@ errout:
return err;
}
+/**
+ * Create attribute index based on nested attribute
+ * @arg tb Index array to be filled (maxtype+1 elements).
+ * @arg maxtype Maximum attribute type expected and accepted.
+ * @arg nla Nested Attribute.
+ * @arg policy Attribute validation policy.
+ *
+ * Feeds the stream of attributes nested into the specified attribute
+ * to nla_parse().
+ *
+ * @see nla_parse
+ * @return 0 on success or a negative error code.
+ */
+int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ struct nla_policy *policy)
+{
+ return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy);
+}
+
/* dump netlink extended ack error message */
int nla_dump_errormsg(struct nlmsghdr *nlh)
{
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index 931a71f68f93..a6e2396bce7c 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -67,6 +67,44 @@ struct nla_policy {
nla_ok(pos, rem); \
pos = nla_next(pos, &(rem)))
+/**
+ * nla_data - head of payload
+ * @nla: netlink attribute
+ */
+static inline void *nla_data(const struct nlattr *nla)
+{
+ return (char *) nla + NLA_HDRLEN;
+}
+
+static inline uint8_t nla_getattr_u8(const struct nlattr *nla)
+{
+ return *(uint8_t *)nla_data(nla);
+}
+
+static inline uint32_t nla_getattr_u32(const struct nlattr *nla)
+{
+ return *(uint32_t *)nla_data(nla);
+}
+
+static inline const char *nla_getattr_str(const struct nlattr *nla)
+{
+ return (const char *)nla_data(nla);
+}
+
+/**
+ * nla_len - length of payload
+ * @nla: netlink attribute
+ */
+static inline int nla_len(const struct nlattr *nla)
+{
+ return nla->nla_len - NLA_HDRLEN;
+}
+
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+ struct nla_policy *policy);
+int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ struct nla_policy *policy);
+
int nla_dump_errormsg(struct nlmsghdr *nlh);
#endif /* __NLATTR_H */
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 49938d72cf63..8a60c9b9892d 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -19,3 +19,9 @@ test_btf
test_sockmap
test_lirc_mode2_user
get_cgroup_id_user
+test_skb_cgroup_id_user
+test_socket_cookie
+test_cgroup_storage
+test_select_reuseport
+test_flow_dissector
+flow_dissector_load
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fff7fb1285fc..fd3851d5c079 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
- test_skb_cgroup_id_kern.o
+ test_skb_cgroup_id_kern.o bpf_flow.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
@@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \
test_tunnel.sh \
test_lwt_seg6local.sh \
test_lirc_mode2.sh \
- test_skb_cgroup_id.sh
+ test_skb_cgroup_id.sh \
+ test_flow_dissector.sh
# Compile but not part of 'make run_tests'
-TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
+ flow_dissector_load test_flow_dissector
include ../lib.mk
diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c
new file mode 100644
index 000000000000..107350a7821d
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_flow.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <limits.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_packet.h>
+#include <sys/socket.h>
+#include <linux/if_tunnel.h>
+#include <linux/mpls.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+#define PROG(F) SEC(#F) int bpf_func_##F
+
+/* These are the identifiers of the BPF programs that will be used in tail
+ * calls. Name is limited to 16 characters, with the terminating character and
+ * bpf_func_ above, we have only 6 to work with, anything after will be cropped.
+ */
+enum {
+ IP,
+ IPV6,
+ IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */
+ IPV6FR, /* Fragmentation IPv6 Extension Header */
+ MPLS,
+ VLAN,
+};
+
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+#define IP6_MF 0x0001
+#define IP6_OFFSET 0xFFF8
+
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+};
+
+struct frag_hdr {
+ __u8 nexthdr;
+ __u8 reserved;
+ __be16 frag_off;
+ __be32 identification;
+};
+
+struct bpf_map_def SEC("maps") jmp_table = {
+ .type = BPF_MAP_TYPE_PROG_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u32),
+ .max_entries = 8
+};
+
+static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
+ __u16 hdr_size,
+ void *buffer)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ __u16 nhoff = skb->flow_keys->nhoff;
+ __u8 *hdr;
+
+ /* Verifies this variable offset does not overflow */
+ if (nhoff > (USHRT_MAX - hdr_size))
+ return NULL;
+
+ hdr = data + nhoff;
+ if (hdr + hdr_size <= data_end)
+ return hdr;
+
+ if (bpf_skb_load_bytes(skb, nhoff, buffer, hdr_size))
+ return NULL;
+
+ return buffer;
+}
+
+/* Dispatches on ETHERTYPE */
+static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+
+ keys->n_proto = proto;
+ switch (proto) {
+ case bpf_htons(ETH_P_IP):
+ bpf_tail_call(skb, &jmp_table, IP);
+ break;
+ case bpf_htons(ETH_P_IPV6):
+ bpf_tail_call(skb, &jmp_table, IPV6);
+ break;
+ case bpf_htons(ETH_P_MPLS_MC):
+ case bpf_htons(ETH_P_MPLS_UC):
+ bpf_tail_call(skb, &jmp_table, MPLS);
+ break;
+ case bpf_htons(ETH_P_8021Q):
+ case bpf_htons(ETH_P_8021AD):
+ bpf_tail_call(skb, &jmp_table, VLAN);
+ break;
+ default:
+ /* Protocol not supported */
+ return BPF_DROP;
+ }
+
+ return BPF_DROP;
+}
+
+SEC("dissect")
+int _dissect(struct __sk_buff *skb)
+{
+ if (!skb->vlan_present)
+ return parse_eth_proto(skb, skb->protocol);
+ else
+ return parse_eth_proto(skb, skb->vlan_proto);
+}
+
+/* Parses on IPPROTO_* */
+static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ void *data_end = (void *)(long)skb->data_end;
+ struct icmphdr *icmp, _icmp;
+ struct gre_hdr *gre, _gre;
+ struct ethhdr *eth, _eth;
+ struct tcphdr *tcp, _tcp;
+ struct udphdr *udp, _udp;
+
+ keys->ip_proto = proto;
+ switch (proto) {
+ case IPPROTO_ICMP:
+ icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
+ if (!icmp)
+ return BPF_DROP;
+ return BPF_OK;
+ case IPPROTO_IPIP:
+ keys->is_encap = true;
+ return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
+ case IPPROTO_IPV6:
+ keys->is_encap = true;
+ return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
+ case IPPROTO_GRE:
+ gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
+ if (!gre)
+ return BPF_DROP;
+
+ if (bpf_htons(gre->flags & GRE_VERSION))
+ /* Only inspect standard GRE packets with version 0 */
+ return BPF_OK;
+
+ keys->nhoff += sizeof(*gre); /* Step over GRE Flags and Proto */
+ if (GRE_IS_CSUM(gre->flags))
+ keys->nhoff += 4; /* Step over chksum and Padding */
+ if (GRE_IS_KEY(gre->flags))
+ keys->nhoff += 4; /* Step over key */
+ if (GRE_IS_SEQ(gre->flags))
+ keys->nhoff += 4; /* Step over sequence number */
+
+ keys->is_encap = true;
+
+ if (gre->proto == bpf_htons(ETH_P_TEB)) {
+ eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
+ &_eth);
+ if (!eth)
+ return BPF_DROP;
+
+ keys->nhoff += sizeof(*eth);
+
+ return parse_eth_proto(skb, eth->h_proto);
+ } else {
+ return parse_eth_proto(skb, gre->proto);
+ }
+ case IPPROTO_TCP:
+ tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp);
+ if (!tcp)
+ return BPF_DROP;
+
+ if (tcp->doff < 5)
+ return BPF_DROP;
+
+ if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
+ return BPF_DROP;
+
+ keys->thoff = keys->nhoff;
+ keys->sport = tcp->source;
+ keys->dport = tcp->dest;
+ return BPF_OK;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp);
+ if (!udp)
+ return BPF_DROP;
+
+ keys->thoff = keys->nhoff;
+ keys->sport = udp->source;
+ keys->dport = udp->dest;
+ return BPF_OK;
+ default:
+ return BPF_DROP;
+ }
+
+ return BPF_DROP;
+}
+
+static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+
+ keys->ip_proto = nexthdr;
+ switch (nexthdr) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS:
+ bpf_tail_call(skb, &jmp_table, IPV6OP);
+ break;
+ case IPPROTO_FRAGMENT:
+ bpf_tail_call(skb, &jmp_table, IPV6FR);
+ break;
+ default:
+ return parse_ip_proto(skb, nexthdr);
+ }
+
+ return BPF_DROP;
+}
+
+PROG(IP)(struct __sk_buff *skb)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ void *data = (void *)(long)skb->data;
+ struct iphdr *iph, _iph;
+ bool done = false;
+
+ iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph);
+ if (!iph)
+ return BPF_DROP;
+
+ /* IP header cannot be smaller than 20 bytes */
+ if (iph->ihl < 5)
+ return BPF_DROP;
+
+ keys->addr_proto = ETH_P_IP;
+ keys->ipv4_src = iph->saddr;
+ keys->ipv4_dst = iph->daddr;
+
+ keys->nhoff += iph->ihl << 2;
+ if (data + keys->nhoff > data_end)
+ return BPF_DROP;
+
+ if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
+ keys->is_frag = true;
+ if (iph->frag_off & bpf_htons(IP_OFFSET))
+ /* From second fragment on, packets do not have headers
+ * we can parse.
+ */
+ done = true;
+ else
+ keys->is_first_frag = true;
+ }
+
+ if (done)
+ return BPF_OK;
+
+ return parse_ip_proto(skb, iph->protocol);
+}
+
+PROG(IPV6)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct ipv6hdr *ip6h, _ip6h;
+
+ ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
+ if (!ip6h)
+ return BPF_DROP;
+
+ keys->addr_proto = ETH_P_IPV6;
+ memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
+
+ keys->nhoff += sizeof(struct ipv6hdr);
+
+ return parse_ipv6_proto(skb, ip6h->nexthdr);
+}
+
+PROG(IPV6OP)(struct __sk_buff *skb)
+{
+ struct ipv6_opt_hdr *ip6h, _ip6h;
+
+ ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
+ if (!ip6h)
+ return BPF_DROP;
+
+ /* hlen is in 8-octets and does not include the first 8 bytes
+ * of the header
+ */
+ skb->flow_keys->nhoff += (1 + ip6h->hdrlen) << 3;
+
+ return parse_ipv6_proto(skb, ip6h->nexthdr);
+}
+
+PROG(IPV6FR)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct frag_hdr *fragh, _fragh;
+
+ fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh);
+ if (!fragh)
+ return BPF_DROP;
+
+ keys->nhoff += sizeof(*fragh);
+ keys->is_frag = true;
+ if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
+ keys->is_first_frag = true;
+
+ return parse_ipv6_proto(skb, fragh->nexthdr);
+}
+
+PROG(MPLS)(struct __sk_buff *skb)
+{
+ struct mpls_label *mpls, _mpls;
+
+ mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls), &_mpls);
+ if (!mpls)
+ return BPF_DROP;
+
+ return BPF_OK;
+}
+
+PROG(VLAN)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct vlan_hdr *vlan, _vlan;
+ __be16 proto;
+
+ /* Peek back to see if single or double-tagging */
+ if (bpf_skb_load_bytes(skb, keys->nhoff - sizeof(proto), &proto,
+ sizeof(proto)))
+ return BPF_DROP;
+
+ /* Account for double-tagging */
+ if (proto == bpf_htons(ETH_P_8021AD)) {
+ vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
+ if (!vlan)
+ return BPF_DROP;
+
+ if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
+ return BPF_DROP;
+
+ keys->nhoff += sizeof(*vlan);
+ }
+
+ vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
+ if (!vlan)
+ return BPF_DROP;
+
+ keys->nhoff += sizeof(*vlan);
+ /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
+ if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
+ vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
+ return BPF_DROP;
+
+ return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto);
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index b4994a94968b..3655508f95fd 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m
CONFIG_CRYPTO_SHA256=m
CONFIG_VXLAN=y
CONFIG_GENEVE=y
+CONFIG_NET_CLS_FLOWER=m
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c
new file mode 100644
index 000000000000..d3273b5b3173
--- /dev/null
+++ b/tools/testing/selftests/bpf/flow_dissector_load.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector";
+const char *cfg_map_name = "jmp_table";
+bool cfg_attach = true;
+char *cfg_section_name;
+char *cfg_path_name;
+
+static void load_and_attach_program(void)
+{
+ struct bpf_program *prog, *main_prog;
+ struct bpf_map *prog_array;
+ int i, fd, prog_fd, ret;
+ struct bpf_object *obj;
+ int prog_array_fd;
+
+ ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj,
+ &prog_fd);
+ if (ret)
+ error(1, 0, "bpf_prog_load %s", cfg_path_name);
+
+ main_prog = bpf_object__find_program_by_title(obj, cfg_section_name);
+ if (!main_prog)
+ error(1, 0, "bpf_object__find_program_by_title %s",
+ cfg_section_name);
+
+ prog_fd = bpf_program__fd(main_prog);
+ if (prog_fd < 0)
+ error(1, 0, "bpf_program__fd");
+
+ prog_array = bpf_object__find_map_by_name(obj, cfg_map_name);
+ if (!prog_array)
+ error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name);
+
+ prog_array_fd = bpf_map__fd(prog_array);
+ if (prog_array_fd < 0)
+ error(1, 0, "bpf_map__fd %s", cfg_map_name);
+
+ i = 0;
+ bpf_object__for_each_program(prog, obj) {
+ fd = bpf_program__fd(prog);
+ if (fd < 0)
+ error(1, 0, "bpf_program__fd");
+
+ if (fd != prog_fd) {
+ printf("%d: %s\n", i, bpf_program__title(prog, false));
+ bpf_map_update_elem(prog_array_fd, &i, &fd, BPF_ANY);
+ ++i;
+ }
+ }
+
+ ret = bpf_prog_attach(prog_fd, 0 /* Ignore */, BPF_FLOW_DISSECTOR, 0);
+ if (ret)
+ error(1, 0, "bpf_prog_attach %s", cfg_path_name);
+
+ ret = bpf_object__pin(obj, cfg_pin_path);
+ if (ret)
+ error(1, 0, "bpf_object__pin %s", cfg_pin_path);
+
+}
+
+static void detach_program(void)
+{
+ char command[64];
+ int ret;
+
+ ret = bpf_prog_detach(0, BPF_FLOW_DISSECTOR);
+ if (ret)
+ error(1, 0, "bpf_prog_detach");
+
+ /* To unpin, it is necessary and sufficient to just remove this dir */
+ sprintf(command, "rm -r %s", cfg_pin_path);
+ ret = system(command);
+ if (ret)
+ error(1, errno, command);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ bool attach = false;
+ bool detach = false;
+ int c;
+
+ while ((c = getopt(argc, argv, "adp:s:")) != -1) {
+ switch (c) {
+ case 'a':
+ if (detach)
+ error(1, 0, "attach/detach are exclusive");
+ attach = true;
+ break;
+ case 'd':
+ if (attach)
+ error(1, 0, "attach/detach are exclusive");
+ detach = true;
+ break;
+ case 'p':
+ if (cfg_path_name)
+ error(1, 0, "only one prog name can be given");
+
+ cfg_path_name = optarg;
+ break;
+ case 's':
+ if (cfg_section_name)
+ error(1, 0, "only one section can be given");
+
+ cfg_section_name = optarg;
+ break;
+ }
+ }
+
+ if (detach)
+ cfg_attach = false;
+
+ if (cfg_attach && !cfg_path_name)
+ error(1, 0, "must provide a path to the BPF program");
+
+ if (cfg_attach && !cfg_section_name)
+ error(1, 0, "must provide a section name");
+}
+
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+ if (cfg_attach)
+ load_and_attach_program();
+ else
+ detach_program();
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c
new file mode 100644
index 000000000000..12b784afba31
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_flow_dissector.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Inject packets with all sorts of encapsulation into the kernel.
+ *
+ * IPv4/IPv6 outer layer 3
+ * GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/..
+ * IPv4/IPv6 inner layer 3
+ */
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <asm/byteorder.h>
+#include <error.h>
+#include <errno.h>
+#include <linux/if_packet.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <netinet/ip.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define CFG_PORT_INNER 8000
+
+/* Add some protocol definitions that do not exist in userspace */
+
+struct grehdr {
+ uint16_t unused;
+ uint16_t protocol;
+} __attribute__((packed));
+
+struct guehdr {
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 hlen:5,
+ control:1,
+ version:2;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 version:2,
+ control:1,
+ hlen:5;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __u8 proto_ctype;
+ __be16 flags;
+ };
+ __be32 word;
+ };
+};
+
+static uint8_t cfg_dsfield_inner;
+static uint8_t cfg_dsfield_outer;
+static uint8_t cfg_encap_proto;
+static bool cfg_expect_failure = false;
+static int cfg_l3_extra = AF_UNSPEC; /* optional SIT prefix */
+static int cfg_l3_inner = AF_UNSPEC;
+static int cfg_l3_outer = AF_UNSPEC;
+static int cfg_num_pkt = 10;
+static int cfg_num_secs = 0;
+static char cfg_payload_char = 'a';
+static int cfg_payload_len = 100;
+static int cfg_port_gue = 6080;
+static bool cfg_only_rx;
+static bool cfg_only_tx;
+static int cfg_src_port = 9;
+
+static char buf[ETH_DATA_LEN];
+
+#define INIT_ADDR4(name, addr4, port) \
+ static struct sockaddr_in name = { \
+ .sin_family = AF_INET, \
+ .sin_port = __constant_htons(port), \
+ .sin_addr.s_addr = __constant_htonl(addr4), \
+ };
+
+#define INIT_ADDR6(name, addr6, port) \
+ static struct sockaddr_in6 name = { \
+ .sin6_family = AF_INET6, \
+ .sin6_port = __constant_htons(port), \
+ .sin6_addr = addr6, \
+ };
+
+INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER)
+INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0)
+INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0)
+INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0)
+
+INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER)
+INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+
+static unsigned long util_gettime(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void util_printaddr(const char *msg, struct sockaddr *addr)
+{
+ unsigned long off = 0;
+ char nbuf[INET6_ADDRSTRLEN];
+
+ switch (addr->sa_family) {
+ case PF_INET:
+ off = __builtin_offsetof(struct sockaddr_in, sin_addr);
+ break;
+ case PF_INET6:
+ off = __builtin_offsetof(struct sockaddr_in6, sin6_addr);
+ break;
+ default:
+ error(1, 0, "printaddr: unsupported family %u\n",
+ addr->sa_family);
+ }
+
+ if (!inet_ntop(addr->sa_family, ((void *) addr) + off, nbuf,
+ sizeof(nbuf)))
+ error(1, errno, "inet_ntop");
+
+ fprintf(stderr, "%s: %s\n", msg, nbuf);
+}
+
+static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for (i = 0; i < num_u16; i++)
+ sum += start[i];
+
+ return sum;
+}
+
+static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
+ unsigned long sum)
+{
+ sum += add_csum_hword(start, num_u16);
+
+ while (sum >> 16)
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ return ~sum;
+}
+
+static void build_ipv4_header(void *header, uint8_t proto,
+ uint32_t src, uint32_t dst,
+ int payload_len, uint8_t tos)
+{
+ struct iphdr *iph = header;
+
+ iph->ihl = 5;
+ iph->version = 4;
+ iph->tos = tos;
+ iph->ttl = 8;
+ iph->tot_len = htons(sizeof(*iph) + payload_len);
+ iph->id = htons(1337);
+ iph->protocol = proto;
+ iph->saddr = src;
+ iph->daddr = dst;
+ iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0);
+}
+
+static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield)
+{
+ uint16_t val, *ptr = (uint16_t *)ip6h;
+
+ val = ntohs(*ptr);
+ val &= 0xF00F;
+ val |= ((uint16_t) dsfield) << 4;
+ *ptr = htons(val);
+}
+
+static void build_ipv6_header(void *header, uint8_t proto,
+ struct sockaddr_in6 *src,
+ struct sockaddr_in6 *dst,
+ int payload_len, uint8_t dsfield)
+{
+ struct ipv6hdr *ip6h = header;
+
+ ip6h->version = 6;
+ ip6h->payload_len = htons(payload_len);
+ ip6h->nexthdr = proto;
+ ip6h->hop_limit = 8;
+ ipv6_set_dsfield(ip6h, dsfield);
+
+ memcpy(&ip6h->saddr, &src->sin6_addr, sizeof(ip6h->saddr));
+ memcpy(&ip6h->daddr, &dst->sin6_addr, sizeof(ip6h->daddr));
+}
+
+static uint16_t build_udp_v4_csum(const struct iphdr *iph,
+ const struct udphdr *udph,
+ int num_words)
+{
+ unsigned long pseudo_sum;
+ int num_u16 = sizeof(iph->saddr); /* halfwords: twice byte len */
+
+ pseudo_sum = add_csum_hword((void *) &iph->saddr, num_u16);
+ pseudo_sum += htons(IPPROTO_UDP);
+ pseudo_sum += udph->len;
+ return build_ip_csum((void *) udph, num_words, pseudo_sum);
+}
+
+static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h,
+ const struct udphdr *udph,
+ int num_words)
+{
+ unsigned long pseudo_sum;
+ int num_u16 = sizeof(ip6h->saddr); /* halfwords: twice byte len */
+
+ pseudo_sum = add_csum_hword((void *) &ip6h->saddr, num_u16);
+ pseudo_sum += htons(ip6h->nexthdr);
+ pseudo_sum += ip6h->payload_len;
+ return build_ip_csum((void *) udph, num_words, pseudo_sum);
+}
+
+static void build_udp_header(void *header, int payload_len,
+ uint16_t dport, int family)
+{
+ struct udphdr *udph = header;
+ int len = sizeof(*udph) + payload_len;
+
+ udph->source = htons(cfg_src_port);
+ udph->dest = htons(dport);
+ udph->len = htons(len);
+ udph->check = 0;
+ if (family == AF_INET)
+ udph->check = build_udp_v4_csum(header - sizeof(struct iphdr),
+ udph, len >> 1);
+ else
+ udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr),
+ udph, len >> 1);
+}
+
+static void build_gue_header(void *header, uint8_t proto)
+{
+ struct guehdr *gueh = header;
+
+ gueh->proto_ctype = proto;
+}
+
+static void build_gre_header(void *header, uint16_t proto)
+{
+ struct grehdr *greh = header;
+
+ greh->protocol = htons(proto);
+}
+
+static int l3_length(int family)
+{
+ if (family == AF_INET)
+ return sizeof(struct iphdr);
+ else
+ return sizeof(struct ipv6hdr);
+}
+
+static int build_packet(void)
+{
+ int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0;
+ int el3_len = 0;
+
+ if (cfg_l3_extra)
+ el3_len = l3_length(cfg_l3_extra);
+
+ /* calculate header offsets */
+ if (cfg_encap_proto) {
+ ol3_len = l3_length(cfg_l3_outer);
+
+ if (cfg_encap_proto == IPPROTO_GRE)
+ ol4_len = sizeof(struct grehdr);
+ else if (cfg_encap_proto == IPPROTO_UDP)
+ ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ }
+
+ il3_len = l3_length(cfg_l3_inner);
+ il4_len = sizeof(struct udphdr);
+
+ if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >=
+ sizeof(buf))
+ error(1, 0, "packet too large\n");
+
+ /*
+ * Fill packet from inside out, to calculate correct checksums.
+ * But create ip before udp headers, as udp uses ip for pseudo-sum.
+ */
+ memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len,
+ cfg_payload_char, cfg_payload_len);
+
+ /* add zero byte for udp csum padding */
+ buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0;
+
+ switch (cfg_l3_inner) {
+ case PF_INET:
+ build_ipv4_header(buf + el3_len + ol3_len + ol4_len,
+ IPPROTO_UDP,
+ in_saddr4.sin_addr.s_addr,
+ in_daddr4.sin_addr.s_addr,
+ il4_len + cfg_payload_len,
+ cfg_dsfield_inner);
+ break;
+ case PF_INET6:
+ build_ipv6_header(buf + el3_len + ol3_len + ol4_len,
+ IPPROTO_UDP,
+ &in_saddr6, &in_daddr6,
+ il4_len + cfg_payload_len,
+ cfg_dsfield_inner);
+ break;
+ }
+
+ build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len,
+ cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner);
+
+ if (!cfg_encap_proto)
+ return il3_len + il4_len + cfg_payload_len;
+
+ switch (cfg_l3_outer) {
+ case PF_INET:
+ build_ipv4_header(buf + el3_len, cfg_encap_proto,
+ out_saddr4.sin_addr.s_addr,
+ out_daddr4.sin_addr.s_addr,
+ ol4_len + il3_len + il4_len + cfg_payload_len,
+ cfg_dsfield_outer);
+ break;
+ case PF_INET6:
+ build_ipv6_header(buf + el3_len, cfg_encap_proto,
+ &out_saddr6, &out_daddr6,
+ ol4_len + il3_len + il4_len + cfg_payload_len,
+ cfg_dsfield_outer);
+ break;
+ }
+
+ switch (cfg_encap_proto) {
+ case IPPROTO_UDP:
+ build_gue_header(buf + el3_len + ol3_len + ol4_len -
+ sizeof(struct guehdr),
+ cfg_l3_inner == PF_INET ? IPPROTO_IPIP
+ : IPPROTO_IPV6);
+ build_udp_header(buf + el3_len + ol3_len,
+ sizeof(struct guehdr) + il3_len + il4_len +
+ cfg_payload_len,
+ cfg_port_gue, cfg_l3_outer);
+ break;
+ case IPPROTO_GRE:
+ build_gre_header(buf + el3_len + ol3_len,
+ cfg_l3_inner == PF_INET ? ETH_P_IP
+ : ETH_P_IPV6);
+ break;
+ }
+
+ switch (cfg_l3_extra) {
+ case PF_INET:
+ build_ipv4_header(buf,
+ cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+ : IPPROTO_IPV6,
+ extra_saddr4.sin_addr.s_addr,
+ extra_daddr4.sin_addr.s_addr,
+ ol3_len + ol4_len + il3_len + il4_len +
+ cfg_payload_len, 0);
+ break;
+ case PF_INET6:
+ build_ipv6_header(buf,
+ cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+ : IPPROTO_IPV6,
+ &extra_saddr6, &extra_daddr6,
+ ol3_len + ol4_len + il3_len + il4_len +
+ cfg_payload_len, 0);
+ break;
+ }
+
+ return el3_len + ol3_len + ol4_len + il3_len + il4_len +
+ cfg_payload_len;
+}
+
+/* sender transmits encapsulated over RAW or unencap'd over UDP */
+static int setup_tx(void)
+{
+ int family, fd, ret;
+
+ if (cfg_l3_extra)
+ family = cfg_l3_extra;
+ else if (cfg_l3_outer)
+ family = cfg_l3_outer;
+ else
+ family = cfg_l3_inner;
+
+ fd = socket(family, SOCK_RAW, IPPROTO_RAW);
+ if (fd == -1)
+ error(1, errno, "socket tx");
+
+ if (cfg_l3_extra) {
+ if (cfg_l3_extra == PF_INET)
+ ret = connect(fd, (void *) &extra_daddr4,
+ sizeof(extra_daddr4));
+ else
+ ret = connect(fd, (void *) &extra_daddr6,
+ sizeof(extra_daddr6));
+ if (ret)
+ error(1, errno, "connect tx");
+ } else if (cfg_l3_outer) {
+ /* connect to destination if not encapsulated */
+ if (cfg_l3_outer == PF_INET)
+ ret = connect(fd, (void *) &out_daddr4,
+ sizeof(out_daddr4));
+ else
+ ret = connect(fd, (void *) &out_daddr6,
+ sizeof(out_daddr6));
+ if (ret)
+ error(1, errno, "connect tx");
+ } else {
+ /* otherwise using loopback */
+ if (cfg_l3_inner == PF_INET)
+ ret = connect(fd, (void *) &in_daddr4,
+ sizeof(in_daddr4));
+ else
+ ret = connect(fd, (void *) &in_daddr6,
+ sizeof(in_daddr6));
+ if (ret)
+ error(1, errno, "connect tx");
+ }
+
+ return fd;
+}
+
+/* receiver reads unencapsulated UDP */
+static int setup_rx(void)
+{
+ int fd, ret;
+
+ fd = socket(cfg_l3_inner, SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket rx");
+
+ if (cfg_l3_inner == PF_INET)
+ ret = bind(fd, (void *) &in_daddr4, sizeof(in_daddr4));
+ else
+ ret = bind(fd, (void *) &in_daddr6, sizeof(in_daddr6));
+ if (ret)
+ error(1, errno, "bind rx");
+
+ return fd;
+}
+
+static int do_tx(int fd, const char *pkt, int len)
+{
+ int ret;
+
+ ret = write(fd, pkt, len);
+ if (ret == -1)
+ error(1, errno, "send");
+ if (ret != len)
+ error(1, errno, "send: len (%d < %d)\n", ret, len);
+
+ return 1;
+}
+
+static int do_poll(int fd, short events, int timeout)
+{
+ struct pollfd pfd;
+ int ret;
+
+ pfd.fd = fd;
+ pfd.events = events;
+
+ ret = poll(&pfd, 1, timeout);
+ if (ret == -1)
+ error(1, errno, "poll");
+ if (ret && !(pfd.revents & POLLIN))
+ error(1, errno, "poll: unexpected event 0x%x\n", pfd.revents);
+
+ return ret;
+}
+
+static int do_rx(int fd)
+{
+ char rbuf;
+ int ret, num = 0;
+
+ while (1) {
+ ret = recv(fd, &rbuf, 1, MSG_DONTWAIT);
+ if (ret == -1 && errno == EAGAIN)
+ break;
+ if (ret == -1)
+ error(1, errno, "recv");
+ if (rbuf != cfg_payload_char)
+ error(1, 0, "recv: payload mismatch");
+ num++;
+ };
+
+ return num;
+}
+
+static int do_main(void)
+{
+ unsigned long tstop, treport, tcur;
+ int fdt = -1, fdr = -1, len, tx = 0, rx = 0;
+
+ if (!cfg_only_tx)
+ fdr = setup_rx();
+ if (!cfg_only_rx)
+ fdt = setup_tx();
+
+ len = build_packet();
+
+ tcur = util_gettime();
+ treport = tcur + 1000;
+ tstop = tcur + (cfg_num_secs * 1000);
+
+ while (1) {
+ if (!cfg_only_rx)
+ tx += do_tx(fdt, buf, len);
+
+ if (!cfg_only_tx)
+ rx += do_rx(fdr);
+
+ if (cfg_num_secs) {
+ tcur = util_gettime();
+ if (tcur >= tstop)
+ break;
+ if (tcur >= treport) {
+ fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
+ tx = 0;
+ rx = 0;
+ treport = tcur + 1000;
+ }
+ } else {
+ if (tx == cfg_num_pkt)
+ break;
+ }
+ }
+
+ /* read straggler packets, if any */
+ if (rx < tx) {
+ tstop = util_gettime() + 100;
+ while (rx < tx) {
+ tcur = util_gettime();
+ if (tcur >= tstop)
+ break;
+
+ do_poll(fdr, POLLIN, tstop - tcur);
+ rx += do_rx(fdr);
+ }
+ }
+
+ fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
+
+ if (fdr != -1 && close(fdr))
+ error(1, errno, "close rx");
+ if (fdt != -1 && close(fdt))
+ error(1, errno, "close tx");
+
+ /*
+ * success (== 0) only if received all packets
+ * unless failure is expected, in which case none must arrive.
+ */
+ if (cfg_expect_failure)
+ return rx != 0;
+ else
+ return rx != tx;
+}
+
+
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+ fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] "
+ "[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] "
+ "[-s <osrc> [-d <odst>] [-S <isrc>] [-D <idst>] "
+ "[-x <otos>] [-X <itos>] [-f <isport>] [-F]\n",
+ filepath);
+ exit(1);
+}
+
+static void parse_addr(int family, void *addr, const char *optarg)
+{
+ int ret;
+
+ ret = inet_pton(family, optarg, addr);
+ if (ret == -1)
+ error(1, errno, "inet_pton");
+ if (ret == 0)
+ error(1, 0, "inet_pton: bad string");
+}
+
+static void parse_addr4(struct sockaddr_in *addr, const char *optarg)
+{
+ parse_addr(AF_INET, &addr->sin_addr, optarg);
+}
+
+static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg)
+{
+ parse_addr(AF_INET6, &addr->sin6_addr, optarg);
+}
+
+static int parse_protocol_family(const char *filepath, const char *optarg)
+{
+ if (!strcmp(optarg, "4"))
+ return PF_INET;
+ if (!strcmp(optarg, "6"))
+ return PF_INET6;
+
+ usage(filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) {
+ switch (c) {
+ case 'd':
+ if (cfg_l3_outer == AF_UNSPEC)
+ error(1, 0, "-d must be preceded by -o");
+ if (cfg_l3_outer == AF_INET)
+ parse_addr4(&out_daddr4, optarg);
+ else
+ parse_addr6(&out_daddr6, optarg);
+ break;
+ case 'D':
+ if (cfg_l3_inner == AF_UNSPEC)
+ error(1, 0, "-D must be preceded by -i");
+ if (cfg_l3_inner == AF_INET)
+ parse_addr4(&in_daddr4, optarg);
+ else
+ parse_addr6(&in_daddr6, optarg);
+ break;
+ case 'e':
+ if (!strcmp(optarg, "gre"))
+ cfg_encap_proto = IPPROTO_GRE;
+ else if (!strcmp(optarg, "gue"))
+ cfg_encap_proto = IPPROTO_UDP;
+ else if (!strcmp(optarg, "bare"))
+ cfg_encap_proto = IPPROTO_IPIP;
+ else if (!strcmp(optarg, "none"))
+ cfg_encap_proto = IPPROTO_IP; /* == 0 */
+ else
+ usage(argv[0]);
+ break;
+ case 'f':
+ cfg_src_port = strtol(optarg, NULL, 0);
+ break;
+ case 'F':
+ cfg_expect_failure = true;
+ break;
+ case 'h':
+ usage(argv[0]);
+ break;
+ case 'i':
+ if (!strcmp(optarg, "4"))
+ cfg_l3_inner = PF_INET;
+ else if (!strcmp(optarg, "6"))
+ cfg_l3_inner = PF_INET6;
+ else
+ usage(argv[0]);
+ break;
+ case 'l':
+ cfg_payload_len = strtol(optarg, NULL, 0);
+ break;
+ case 'n':
+ cfg_num_pkt = strtol(optarg, NULL, 0);
+ break;
+ case 'o':
+ cfg_l3_outer = parse_protocol_family(argv[0], optarg);
+ break;
+ case 'O':
+ cfg_l3_extra = parse_protocol_family(argv[0], optarg);
+ break;
+ case 'R':
+ cfg_only_rx = true;
+ break;
+ case 's':
+ if (cfg_l3_outer == AF_INET)
+ parse_addr4(&out_saddr4, optarg);
+ else
+ parse_addr6(&out_saddr6, optarg);
+ break;
+ case 'S':
+ if (cfg_l3_inner == AF_INET)
+ parse_addr4(&in_saddr4, optarg);
+ else
+ parse_addr6(&in_saddr6, optarg);
+ break;
+ case 't':
+ cfg_num_secs = strtol(optarg, NULL, 0);
+ break;
+ case 'T':
+ cfg_only_tx = true;
+ break;
+ case 'x':
+ cfg_dsfield_outer = strtol(optarg, NULL, 0);
+ break;
+ case 'X':
+ cfg_dsfield_inner = strtol(optarg, NULL, 0);
+ break;
+ }
+ }
+
+ if (cfg_only_rx && cfg_only_tx)
+ error(1, 0, "options: cannot combine rx-only and tx-only");
+
+ if (cfg_encap_proto && cfg_l3_outer == AF_UNSPEC)
+ error(1, 0, "options: must specify outer with encap");
+ else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC)
+ error(1, 0, "options: cannot combine no-encap and outer");
+ else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC)
+ error(1, 0, "options: cannot combine no-encap and extra");
+
+ if (cfg_l3_inner == AF_UNSPEC)
+ cfg_l3_inner = AF_INET6;
+ if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP)
+ cfg_encap_proto = IPPROTO_IPV6;
+
+ /* RFC 6040 4.2:
+ * on decap, if outer encountered congestion (CE == 0x3),
+ * but inner cannot encode ECN (NoECT == 0x0), then drop packet.
+ */
+ if (((cfg_dsfield_outer & 0x3) == 0x3) &&
+ ((cfg_dsfield_inner & 0x3) == 0x0))
+ cfg_expect_failure = true;
+}
+
+static void print_opts(void)
+{
+ if (cfg_l3_inner == PF_INET6) {
+ util_printaddr("inner.dest6", (void *) &in_daddr6);
+ util_printaddr("inner.source6", (void *) &in_saddr6);
+ } else {
+ util_printaddr("inner.dest4", (void *) &in_daddr4);
+ util_printaddr("inner.source4", (void *) &in_saddr4);
+ }
+
+ if (!cfg_l3_outer)
+ return;
+
+ fprintf(stderr, "encap proto: %u\n", cfg_encap_proto);
+
+ if (cfg_l3_outer == PF_INET6) {
+ util_printaddr("outer.dest6", (void *) &out_daddr6);
+ util_printaddr("outer.source6", (void *) &out_saddr6);
+ } else {
+ util_printaddr("outer.dest4", (void *) &out_daddr4);
+ util_printaddr("outer.source4", (void *) &out_saddr4);
+ }
+
+ if (!cfg_l3_extra)
+ return;
+
+ if (cfg_l3_outer == PF_INET6) {
+ util_printaddr("extra.dest6", (void *) &extra_daddr6);
+ util_printaddr("extra.source6", (void *) &extra_saddr6);
+ } else {
+ util_printaddr("extra.dest4", (void *) &extra_daddr4);
+ util_printaddr("extra.source4", (void *) &extra_saddr4);
+ }
+
+}
+
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+ print_opts();
+ return do_main();
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
new file mode 100755
index 000000000000..c0fb073b5eab
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_flow_dissector.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Load BPF flow dissector and verify it correctly dissects traffic
+export TESTNAME=test_flow_dissector
+unmount=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ $UID != 0 ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
+# This test needs to be run in a network namespace with in_netns.sh. Check if
+# this is the case and run it with in_netns.sh if it is being run in the root
+# namespace.
+if [[ -z $(ip netns identify $$) ]]; then
+ ../net/in_netns.sh "$0" "$@"
+ exit $?
+fi
+
+# Determine selftest success via shell exit code
+exit_handler()
+{
+ if (( $? == 0 )); then
+ echo "selftests: $TESTNAME [PASS]";
+ else
+ echo "selftests: $TESTNAME [FAILED]";
+ fi
+
+ set +e
+
+ # Cleanup
+ tc filter del dev lo ingress pref 1337 2> /dev/null
+ tc qdisc del dev lo ingress 2> /dev/null
+ ./flow_dissector_load -d 2> /dev/null
+ if [ $unmount -ne 0 ]; then
+ umount bpffs 2> /dev/null
+ fi
+}
+
+# Exit script immediately (well catched by trap handler) if any
+# program/thing exits with a non-zero status.
+set -e
+
+# (Use 'trap -l' to list meaning of numbers)
+trap exit_handler 0 2 3 6 9
+
+# Mount BPF file system
+if /bin/mount | grep /sys/fs/bpf > /dev/null; then
+ echo "bpffs already mounted"
+else
+ echo "bpffs not mounted. Mounting..."
+ unmount=1
+ /bin/mount bpffs /sys/fs/bpf -t bpf
+fi
+
+# Attach BPF program
+./flow_dissector_load -p bpf_flow.o -s dissect
+
+# Setup
+tc qdisc add dev lo ingress
+
+echo "Testing IPv4..."
+# Drops all IP/UDP packets coming from port 9
+tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
+ udp src_port 9 action drop
+
+# Send 10 IPv4/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 4 -f 8
+# Send 10 IPv4/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 4 -f 9 -F
+# Send 10 IPv4/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 4 -f 10
+
+echo "Testing IPIP..."
+# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 8
+# Send 10 IPv4/IPv4/UDP packets from port 9. Filter should drop all.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 9 -F
+# Send 10 IPv4/IPv4/UDP packets from port 10. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 10
+
+echo "Testing IPv4 + GRE..."
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 8. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 8
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 9. Filter should drop all.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 9 -F
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 10. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 10
+
+tc filter del dev lo ingress pref 1337
+
+echo "Testing IPv6..."
+# Drops all IPv6/UDP packets coming from port 9
+tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \
+ udp src_port 9 action drop
+
+# Send 10 IPv6/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 6 -f 8
+# Send 10 IPv6/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 6 -f 9 -F
+# Send 10 IPv6/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 6 -f 10
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 0ef68204c84b..63a671803ed6 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -112,13 +112,13 @@ static void test_pkt_access(void)
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration);
- CHECK(err || errno || retval, "ipv4",
+ CHECK(err || retval, "ipv4",
"err %d errno %d retval %d duration %d\n",
err, errno, retval, duration);
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6),
NULL, NULL, &retval, &duration);
- CHECK(err || errno || retval, "ipv6",
+ CHECK(err || retval, "ipv6",
"err %d errno %d retval %d duration %d\n",
err, errno, retval, duration);
bpf_object__close(obj);
@@ -153,14 +153,14 @@ static void test_xdp(void)
err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != XDP_TX || size != 74 ||
+ CHECK(err || retval != XDP_TX || size != 74 ||
iph->protocol != IPPROTO_IPIP, "ipv4",
"err %d errno %d retval %d size %d\n",
err, errno, retval, size);
err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != XDP_TX || size != 114 ||
+ CHECK(err || retval != XDP_TX || size != 114 ||
iph6->nexthdr != IPPROTO_IPV6, "ipv6",
"err %d errno %d retval %d size %d\n",
err, errno, retval, size);
@@ -185,13 +185,13 @@ static void test_xdp_adjust_tail(void)
err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != XDP_DROP,
+ CHECK(err || retval != XDP_DROP,
"ipv4", "err %d errno %d retval %d size %d\n",
err, errno, retval, size);
err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != XDP_TX || size != 54,
+ CHECK(err || retval != XDP_TX || size != 54,
"ipv6", "err %d errno %d retval %d size %d\n",
err, errno, retval, size);
bpf_object__close(obj);
@@ -254,14 +254,14 @@ static void test_l4lb(const char *file)
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 ||
+ CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 ||
*magic != MAGIC_VAL, "ipv4",
"err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic);
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 ||
+ CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 ||
*magic != MAGIC_VAL, "ipv6",
"err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic);
@@ -343,14 +343,14 @@ static void test_xdp_noinline(void)
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != 1 || size != 54 ||
+ CHECK(err || retval != 1 || size != 54 ||
*magic != MAGIC_VAL, "ipv4",
"err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic);
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
- CHECK(err || errno || retval != 1 || size != 74 ||
+ CHECK(err || retval != 1 || size != 74 ||
*magic != MAGIC_VAL, "ipv6",
"err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic);
diff --git a/tools/testing/selftests/bpf/with_addr.sh b/tools/testing/selftests/bpf/with_addr.sh
new file mode 100755
index 000000000000..ffcd3953f94c
--- /dev/null
+++ b/tools/testing/selftests/bpf/with_addr.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# add private ipv4 and ipv6 addresses to loopback
+
+readonly V6_INNER='100::a/128'
+readonly V4_INNER='192.168.0.1/32'
+
+if getopts ":s" opt; then
+ readonly SIT_DEV_NAME='sixtofourtest0'
+ readonly V6_SIT='2::/64'
+ readonly V4_SIT='172.17.0.1/32'
+ shift
+fi
+
+fail() {
+ echo "error: $*" 1>&2
+ exit 1
+}
+
+setup() {
+ ip -6 addr add "${V6_INNER}" dev lo || fail 'failed to setup v6 address'
+ ip -4 addr add "${V4_INNER}" dev lo || fail 'failed to setup v4 address'
+
+ if [[ -n "${V6_SIT}" ]]; then
+ ip link add "${SIT_DEV_NAME}" type sit remote any local any \
+ || fail 'failed to add sit'
+ ip link set dev "${SIT_DEV_NAME}" up \
+ || fail 'failed to bring sit device up'
+ ip -6 addr add "${V6_SIT}" dev "${SIT_DEV_NAME}" \
+ || fail 'failed to setup v6 SIT address'
+ ip -4 addr add "${V4_SIT}" dev "${SIT_DEV_NAME}" \
+ || fail 'failed to setup v4 SIT address'
+ fi
+
+ sleep 2 # avoid race causing bind to fail
+}
+
+cleanup() {
+ if [[ -n "${V6_SIT}" ]]; then
+ ip -4 addr del "${V4_SIT}" dev "${SIT_DEV_NAME}"
+ ip -6 addr del "${V6_SIT}" dev "${SIT_DEV_NAME}"
+ ip link del "${SIT_DEV_NAME}"
+ fi
+
+ ip -4 addr del "${V4_INNER}" dev lo
+ ip -6 addr del "${V6_INNER}" dev lo
+}
+
+trap cleanup EXIT
+
+setup
+"$@"
+exit "$?"
diff --git a/tools/testing/selftests/bpf/with_tunnels.sh b/tools/testing/selftests/bpf/with_tunnels.sh
new file mode 100755
index 000000000000..e24949ed3a20
--- /dev/null
+++ b/tools/testing/selftests/bpf/with_tunnels.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# setup tunnels for flow dissection test
+
+readonly SUFFIX="test_$(mktemp -u XXXX)"
+CONFIG="remote 127.0.0.2 local 127.0.0.1 dev lo"
+
+setup() {
+ ip link add "ipip_${SUFFIX}" type ipip ${CONFIG}
+ ip link add "gre_${SUFFIX}" type gre ${CONFIG}
+ ip link add "sit_${SUFFIX}" type sit ${CONFIG}
+
+ echo "tunnels before test:"
+ ip tunnel show
+
+ ip link set "ipip_${SUFFIX}" up
+ ip link set "gre_${SUFFIX}" up
+ ip link set "sit_${SUFFIX}" up
+}
+
+
+cleanup() {
+ ip tunnel del "ipip_${SUFFIX}"
+ ip tunnel del "gre_${SUFFIX}"
+ ip tunnel del "sit_${SUFFIX}"
+
+ echo "tunnels after test:"
+ ip tunnel show
+}
+
+trap cleanup EXIT
+
+setup
+"$@"
+exit "$?"