aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/net/addrconf.h1
-rw-r--r--include/net/lwtunnel.h2
-rw-r--r--include/uapi/linux/bpf.h26
-rw-r--r--net/core/filter.c49
-rw-r--r--net/core/lwt_bpf.c254
-rw-r--r--net/ipv6/addrconf_core.c6
-rw-r--r--net/ipv6/af_inet6.c7
-rw-r--r--tools/include/uapi/linux/bpf.h26
-rw-r--r--tools/testing/selftests/bpf/Makefile3
-rw-r--r--tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c85
-rwxr-xr-xtools/testing/selftests/bpf/test_lwt_ip_encap.sh311
11 files changed, 758 insertions, 12 deletions
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 20d523ee2fec..269ec27385e9 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -248,6 +248,7 @@ struct ipv6_stub {
const struct in6_addr *addr);
int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
struct dst_entry **dst, struct flowi6 *fl6);
+ int (*ipv6_route_input)(struct sk_buff *skb);
struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..671113bcb2cc 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int lwtunnel_input(struct sk_buff *skb);
int lwtunnel_xmit(struct sk_buff *skb);
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+ bool ingress);
static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 25c8c0e62ecf..bcdd2474eee7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2016,6 +2016,19 @@ union bpf_attr {
* Only works if *skb* contains an IPv6 packet. Insert a
* Segment Routing Header (**struct ipv6_sr_hdr**) inside
* the IPv6 header.
+ * **BPF_LWT_ENCAP_IP**
+ * IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ * must be IPv4 or IPv6, followed by zero or more
+ * additional headers, up to LWT_BPF_MAX_HEADROOM total
+ * bytes in all prepended headers. Please note that
+ * if skb_is_gso(skb) is true, no more than two headers
+ * can be prepended, and the inner header, if present,
+ * should be either GRE or UDP/GUE.
+ *
+ * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ * by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ * BPF_PROG_TYPE_LWT_XMIT.
*
* A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers
@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_SEG6,
- BPF_LWT_ENCAP_SEG6_INLINE
+ BPF_LWT_ENCAP_SEG6_INLINE,
+ BPF_LWT_ENCAP_IP,
};
#define __bpf_md_ptr(type, name) \
@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
BPF_DROP = 2,
/* 3-6 reserved */
BPF_REDIRECT = 7,
- /* >127 are reserved for prog type specific return codes */
+ /* >127 are reserved for prog type specific return codes.
+ *
+ * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+ * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+ * changed and should be routed based on its new L3 header.
+ * (This is an L3 redirect, as opposed to L2 redirect
+ * represented by BPF_REDIRECT above).
+ */
+ BPF_LWT_REROUTE = 128,
};
struct bpf_sock {
diff --git a/net/core/filter.c b/net/core/filter.c
index 353735575204..a78deb2656e1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
+#include <net/lwtunnel.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -4815,7 +4816,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
}
#endif /* CONFIG_IPV6_SEG6_BPF */
-BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+ bool ingress)
+{
+ return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
+}
+#endif
+
+BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
u32, len)
{
switch (type) {
@@ -4824,13 +4833,40 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
case BPF_LWT_ENCAP_SEG6_INLINE:
return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+ case BPF_LWT_ENCAP_IP:
+ return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
+ void *, hdr, u32, len)
+{
+ switch (type) {
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+ case BPF_LWT_ENCAP_IP:
+ return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
+#endif
default:
return -EINVAL;
}
}
-static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
- .func = bpf_lwt_push_encap,
+static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+ .func = bpf_lwt_in_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+ .func = bpf_lwt_xmit_push_encap,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5417,7 +5453,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
- func == bpf_lwt_push_encap)
+ func == bpf_lwt_in_push_encap ||
+ func == bpf_lwt_xmit_push_encap)
return true;
return false;
@@ -5815,7 +5852,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_lwt_push_encap:
- return &bpf_lwt_push_encap_proto;
+ return &bpf_lwt_in_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
@@ -5851,6 +5888,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
+ case BPF_FUNC_lwt_push_encap:
+ return &bpf_lwt_xmit_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index a648568c5e8f..32251f3fcda0 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,8 @@
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
+#include <net/gre.h>
+#include <net/ip6_route.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -55,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
switch (ret) {
case BPF_OK:
+ case BPF_LWT_REROUTE:
break;
case BPF_REDIRECT:
@@ -87,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
return ret;
}
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+ int err = -EINVAL;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct iphdr *iph = ip_hdr(skb);
+
+ err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb_dst(skb)->dev);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ err = ipv6_stub->ipv6_route_input(skb);
+ } else {
+ err = -EAFNOSUPPORT;
+ }
+
+ if (err)
+ goto err;
+ return dst_input(skb);
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
static int bpf_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -98,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
if (ret < 0)
return ret;
+ if (ret == BPF_LWT_REROUTE)
+ return bpf_lwt_input_reroute(skb);
}
if (unlikely(!dst->lwtstate->orig_input)) {
- pr_warn_once("orig_input not set on dst for prog %s\n",
- bpf->out.name);
kfree_skb(skb);
return -EINVAL;
}
@@ -147,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb)
return 0;
}
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+ struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+ int oif = l3mdev ? l3mdev->ifindex : 0;
+ struct dst_entry *dst = NULL;
+ struct sock *sk;
+ struct net *net;
+ bool ipv4;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ ipv4 = true;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ipv4 = false;
+ else
+ return -EAFNOSUPPORT;
+
+ sk = sk_to_full_sk(skb->sk);
+ if (sk) {
+ if (sk->sk_bound_dev_if)
+ oif = sk->sk_bound_dev_if;
+ net = sock_net(sk);
+ } else {
+ net = dev_net(skb_dst(skb)->dev);
+ }
+
+ if (ipv4) {
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {};
+ struct rtable *rt;
+
+ fl4.flowi4_oif = oif;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_uid = sock_net_uid(net, sk);
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ fl4.flowi4_proto = iph->protocol;
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return -EINVAL;
+ dst = &rt->dst;
+ } else {
+ struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct flowi6 fl6 = {};
+
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(net, sk);
+ fl6.flowlabel = ip6_flowinfo(iph6);
+ fl6.flowi6_proto = iph6->nexthdr;
+ fl6.daddr = iph6->daddr;
+ fl6.saddr = iph6->saddr;
+
+ err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
+ if (err || IS_ERR(dst))
+ return -EINVAL;
+ }
+ if (unlikely(dst->error)) {
+ dst_release(dst);
+ return -EINVAL;
+ }
+
+ /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+ * was done for the previous dst, so we are doing it here again, in
+ * case the new dst needs much more space. The call below is a noop
+ * if there is enough header space in skb.
+ */
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ return err;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ if (unlikely(err))
+ return err;
+
+ /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+ return LWTUNNEL_XMIT_DONE;
+}
+
static int bpf_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -154,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
+ __be16 proto = skb->protocol;
int ret;
ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
switch (ret) {
case BPF_OK:
+ /* If the header changed, e.g. via bpf_lwt_push_encap,
+ * BPF_LWT_REROUTE below should have been used if the
+ * protocol was also changed.
+ */
+ if (skb->protocol != proto) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
@@ -169,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb)
return LWTUNNEL_XMIT_CONTINUE;
case BPF_REDIRECT:
return LWTUNNEL_XMIT_DONE;
+ case BPF_LWT_REROUTE:
+ return bpf_lwt_xmit_reroute(skb);
default:
return ret;
}
@@ -390,6 +513,133 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
.owner = THIS_MODULE,
};
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+ int encap_len)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_type |= gso_type;
+ skb_decrease_gso_size(shinfo, encap_len);
+ shinfo->gso_segs = 0;
+ return 0;
+}
+
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+ int next_hdr_offset;
+ void *next_hdr;
+ __u8 protocol;
+
+ /* SCTP and UDP_L4 gso need more nuanced handling than what
+ * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+ * So at the moment only TCP GSO packets are let through.
+ */
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ return -ENOTSUPP;
+
+ if (ipv4) {
+ protocol = ip_hdr(skb)->protocol;
+ next_hdr_offset = sizeof(struct iphdr);
+ next_hdr = skb_network_header(skb) + next_hdr_offset;
+ } else {
+ protocol = ipv6_hdr(skb)->nexthdr;
+ next_hdr_offset = sizeof(struct ipv6hdr);
+ next_hdr = skb_network_header(skb) + next_hdr_offset;
+ }
+
+ switch (protocol) {
+ case IPPROTO_GRE:
+ next_hdr_offset += sizeof(struct gre_base_hdr);
+ if (next_hdr_offset > encap_len)
+ return -EINVAL;
+
+ if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+ return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+ encap_len);
+ return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+ case IPPROTO_UDP:
+ next_hdr_offset += sizeof(struct udphdr);
+ if (next_hdr_offset > encap_len)
+ return -EINVAL;
+
+ if (((struct udphdr *)next_hdr)->check)
+ return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+ encap_len);
+ return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+ case IPPROTO_IP:
+ case IPPROTO_IPV6:
+ if (ipv4)
+ return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+ else
+ return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+ struct iphdr *iph;
+ bool ipv4;
+ int err;
+
+ if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+ return -EINVAL;
+
+ /* validate protocol and length */
+ iph = (struct iphdr *)hdr;
+ if (iph->version == 4) {
+ ipv4 = true;
+ if (unlikely(len < iph->ihl * 4))
+ return -EINVAL;
+ } else if (iph->version == 6) {
+ ipv4 = false;
+ if (unlikely(len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ if (ingress)
+ err = skb_cow_head(skb, len + skb->mac_len);
+ else
+ err = skb_cow_head(skb,
+ len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+ if (unlikely(err))
+ return err;
+
+ /* push the encap headers and fix pointers */
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ skb_push(skb, len);
+ if (ingress)
+ skb_postpush_rcsum(skb, iph, len);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), hdr, len);
+ bpf_compute_data_pointers(skb);
+ skb_clear_hash(skb);
+
+ if (ipv4) {
+ skb->protocol = htons(ETH_P_IP);
+ iph = ip_hdr(skb);
+
+ if (!iph->check)
+ iph->check = ip_fast_csum((unsigned char *)iph,
+ iph->ihl);
+ } else {
+ skb->protocol = htons(ETH_P_IPV6);
+ }
+
+ if (skb_is_gso(skb))
+ return handle_gso_encap(skb, ipv4, len);
+
+ return 0;
+}
+
static int __init bpf_lwt_init(void)
{
return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5cd0029d930e..6c79af056d9b 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,6 +134,11 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
return -EAFNOSUPPORT;
}
+static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
+{
+ return -EAFNOSUPPORT;
+}
+
static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
{
return NULL;
@@ -170,6 +175,7 @@ eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .ipv6_route_input = eafnosupport_ipv6_route_input,
.fib6_get_table = eafnosupport_fib6_get_table,
.fib6_table_lookup = eafnosupport_fib6_table_lookup,
.fib6_lookup = eafnosupport_fib6_lookup,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d99753b5e39b..2f45d2a3e3a3 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -900,10 +900,17 @@ static struct pernet_operations inet6_net_ops = {
.exit = inet6_net_exit,
};
+static int ipv6_route_input(struct sk_buff *skb)
+{
+ ip6_route_input(skb);
+ return skb_dst(skb)->error;
+}
+
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
.ipv6_dst_lookup = ip6_dst_lookup,
+ .ipv6_route_input = ipv6_route_input,
.fib6_get_table = fib6_get_table,
.fib6_table_lookup = fib6_table_lookup,
.fib6_lookup = fib6_lookup,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 25c8c0e62ecf..bcdd2474eee7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2016,6 +2016,19 @@ union bpf_attr {
* Only works if *skb* contains an IPv6 packet. Insert a
* Segment Routing Header (**struct ipv6_sr_hdr**) inside
* the IPv6 header.
+ * **BPF_LWT_ENCAP_IP**
+ * IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ * must be IPv4 or IPv6, followed by zero or more
+ * additional headers, up to LWT_BPF_MAX_HEADROOM total
+ * bytes in all prepended headers. Please note that
+ * if skb_is_gso(skb) is true, no more than two headers
+ * can be prepended, and the inner header, if present,
+ * should be either GRE or UDP/GUE.
+ *
+ * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ * by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ * BPF_PROG_TYPE_LWT_XMIT.
*
* A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers
@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_SEG6,
- BPF_LWT_ENCAP_SEG6_INLINE
+ BPF_LWT_ENCAP_SEG6_INLINE,
+ BPF_LWT_ENCAP_IP,
};
#define __bpf_md_ptr(type, name) \
@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
BPF_DROP = 2,
/* 3-6 reserved */
BPF_REDIRECT = 7,
- /* >127 are reserved for prog type specific return codes */
+ /* >127 are reserved for prog type specific return codes.
+ *
+ * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+ * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+ * changed and should be routed based on its new L3 header.
+ * (This is an L3 redirect, as opposed to L2 redirect
+ * represented by BPF_REDIRECT above).
+ */
+ BPF_LWT_REROUTE = 128,
};
struct bpf_sock {
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index c3edf47da05d..ccffaa0a0787 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -50,7 +50,8 @@ TEST_PROGS := test_kmod.sh \
test_lirc_mode2.sh \
test_skb_cgroup_id.sh \
test_flow_dissector.sh \
- test_xdp_vlan.sh
+ test_xdp_vlan.sh \
+ test_lwt_ip_encap.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
new file mode 100644
index 000000000000..c957d6dfe6d7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+struct grehdr {
+ __be16 flags;
+ __be16 protocol;
+};
+
+SEC("encap_gre")
+int bpf_lwt_encap_gre(struct __sk_buff *skb)
+{
+ struct encap_hdr {
+ struct iphdr iph;
+ struct grehdr greh;
+ } hdr;
+ int err;
+
+ memset(&hdr, 0, sizeof(struct encap_hdr));
+
+ hdr.iph.ihl = 5;
+ hdr.iph.version = 4;
+ hdr.iph.ttl = 0x40;
+ hdr.iph.protocol = 47; /* IPPROTO_GRE */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ hdr.iph.saddr = 0x640110ac; /* 172.16.1.100 */
+ hdr.iph.daddr = 0x641010ac; /* 172.16.16.100 */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ hdr.iph.saddr = 0xac100164; /* 172.16.1.100 */
+ hdr.iph.daddr = 0xac101064; /* 172.16.16.100 */
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+ hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));
+
+ hdr.greh.protocol = skb->protocol;
+
+ err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+ sizeof(struct encap_hdr));
+ if (err)
+ return BPF_DROP;
+
+ return BPF_LWT_REROUTE;
+}
+
+SEC("encap_gre6")
+int bpf_lwt_encap_gre6(struct __sk_buff *skb)
+{
+ struct encap_hdr {
+ struct ipv6hdr ip6hdr;
+ struct grehdr greh;
+ } hdr;
+ int err;
+
+ memset(&hdr, 0, sizeof(struct encap_hdr));
+
+ hdr.ip6hdr.version = 6;
+ hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr));
+ hdr.ip6hdr.nexthdr = 47; /* IPPROTO_GRE */
+ hdr.ip6hdr.hop_limit = 0x40;
+ /* fb01::1 */
+ hdr.ip6hdr.saddr.s6_addr[0] = 0xfb;
+ hdr.ip6hdr.saddr.s6_addr[1] = 1;
+ hdr.ip6hdr.saddr.s6_addr[15] = 1;
+ /* fb10::1 */
+ hdr.ip6hdr.daddr.s6_addr[0] = 0xfb;
+ hdr.ip6hdr.daddr.s6_addr[1] = 0x10;
+ hdr.ip6hdr.daddr.s6_addr[15] = 1;
+
+ hdr.greh.protocol = skb->protocol;
+
+ err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+ sizeof(struct encap_hdr));
+ if (err)
+ return BPF_DROP;
+
+ return BPF_LWT_REROUTE;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
new file mode 100755
index 000000000000..4ca714e23ab0
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
@@ -0,0 +1,311 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Setup/topology:
+#
+# NS1 NS2 NS3
+# veth1 <---> veth2 veth3 <---> veth4 (the top route)
+# veth5 <---> veth6 veth7 <---> veth8 (the bottom route)
+#
+# each vethN gets IPv[4|6]_N address
+#
+# IPv*_SRC = IPv*_1
+# IPv*_DST = IPv*_4
+#
+# all tests test pings from IPv*_SRC to IPv*_DST
+#
+# by default, routes are configured to allow packets to go
+# IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
+#
+# a GRE device is installed in NS3 with IPv*_GRE, and
+# NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
+# (the bottom route)
+#
+# Tests:
+#
+# 1. routes NS2->IPv*_DST are brought down, so the only way a ping
+# from IP*_SRC to IP*_DST can work is via IPv*_GRE
+#
+# 2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
+# that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+# ping: SRC->[encap at veth1:egress]->GRE:decap->DST
+# ping replies go DST->SRC directly
+#
+# 2b. in an ingress test, a bpf LWT_IN program is installed on veth2
+# that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+# ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
+# ping replies go DST->SRC directly
+
+set -e # exit on error
+
+if [[ $EUID -ne 0 ]]; then
+ echo "This script must be run as root"
+ echo "FAIL"
+ exit 1
+fi
+
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
+
+readonly IPv4_1="172.16.1.100"
+readonly IPv4_2="172.16.2.100"
+readonly IPv4_3="172.16.3.100"
+readonly IPv4_4="172.16.4.100"
+readonly IPv4_5="172.16.5.100"
+readonly IPv4_6="172.16.6.100"
+readonly IPv4_7="172.16.7.100"
+readonly IPv4_8="172.16.8.100"
+readonly IPv4_GRE="172.16.16.100"
+
+readonly IPv4_SRC=$IPv4_1
+readonly IPv4_DST=$IPv4_4
+
+readonly IPv6_1="fb01::1"
+readonly IPv6_2="fb02::1"
+readonly IPv6_3="fb03::1"
+readonly IPv6_4="fb04::1"
+readonly IPv6_5="fb05::1"
+readonly IPv6_6="fb06::1"
+readonly IPv6_7="fb07::1"
+readonly IPv6_8="fb08::1"
+readonly IPv6_GRE="fb10::1"
+
+readonly IPv6_SRC=$IPv6_1
+readonly IPv6_DST=$IPv6_4
+
+setup() {
+set -e # exit on error
+ # create devices and namespaces
+ ip netns add "${NS1}"
+ ip netns add "${NS2}"
+ ip netns add "${NS3}"
+
+ ip link add veth1 type veth peer name veth2
+ ip link add veth3 type veth peer name veth4
+ ip link add veth5 type veth peer name veth6
+ ip link add veth7 type veth peer name veth8
+
+ ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1
+ ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1
+
+ ip link set veth1 netns ${NS1}
+ ip link set veth2 netns ${NS2}
+ ip link set veth3 netns ${NS2}
+ ip link set veth4 netns ${NS3}
+ ip link set veth5 netns ${NS1}
+ ip link set veth6 netns ${NS2}
+ ip link set veth7 netns ${NS2}
+ ip link set veth8 netns ${NS3}
+
+ # configure addesses: the top route (1-2-3-4)
+ ip -netns ${NS1} addr add ${IPv4_1}/24 dev veth1
+ ip -netns ${NS2} addr add ${IPv4_2}/24 dev veth2
+ ip -netns ${NS2} addr add ${IPv4_3}/24 dev veth3
+ ip -netns ${NS3} addr add ${IPv4_4}/24 dev veth4
+ ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1
+ ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2
+ ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3
+ ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4
+
+ # configure addresses: the bottom route (5-6-7-8)
+ ip -netns ${NS1} addr add ${IPv4_5}/24 dev veth5
+ ip -netns ${NS2} addr add ${IPv4_6}/24 dev veth6
+ ip -netns ${NS2} addr add ${IPv4_7}/24 dev veth7
+ ip -netns ${NS3} addr add ${IPv4_8}/24 dev veth8
+ ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5
+ ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6
+ ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7
+ ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8
+
+
+ ip -netns ${NS1} link set dev veth1 up
+ ip -netns ${NS2} link set dev veth2 up
+ ip -netns ${NS2} link set dev veth3 up
+ ip -netns ${NS3} link set dev veth4 up
+ ip -netns ${NS1} link set dev veth5 up
+ ip -netns ${NS2} link set dev veth6 up
+ ip -netns ${NS2} link set dev veth7 up
+ ip -netns ${NS3} link set dev veth8 up
+
+ # configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
+ # the bottom route to specific bottom addresses
+
+ # NS1
+ # top route
+ ip -netns ${NS1} route add ${IPv4_2}/32 dev veth1
+ ip -netns ${NS1} route add default dev veth1 via ${IPv4_2} # go top by default
+ ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1
+ ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2} # go top by default
+ # bottom route
+ ip -netns ${NS1} route add ${IPv4_6}/32 dev veth5
+ ip -netns ${NS1} route add ${IPv4_7}/32 dev veth5 via ${IPv4_6}
+ ip -netns ${NS1} route add ${IPv4_8}/32 dev veth5 via ${IPv4_6}
+ ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5
+ ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6}
+ ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6}
+
+ # NS2
+ # top route
+ ip -netns ${NS2} route add ${IPv4_1}/32 dev veth2
+ ip -netns ${NS2} route add ${IPv4_4}/32 dev veth3
+ ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2
+ ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3
+ # bottom route
+ ip -netns ${NS2} route add ${IPv4_5}/32 dev veth6
+ ip -netns ${NS2} route add ${IPv4_8}/32 dev veth7
+ ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6
+ ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7
+
+ # NS3
+ # top route
+ ip -netns ${NS3} route add ${IPv4_3}/32 dev veth4
+ ip -netns ${NS3} route add ${IPv4_1}/32 dev veth4 via ${IPv4_3}
+ ip -netns ${NS3} route add ${IPv4_2}/32 dev veth4 via ${IPv4_3}
+ ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
+ ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
+ ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
+ # bottom route
+ ip -netns ${NS3} route add ${IPv4_7}/32 dev veth8
+ ip -netns ${NS3} route add ${IPv4_5}/32 dev veth8 via ${IPv4_7}
+ ip -netns ${NS3} route add ${IPv4_6}/32 dev veth8 via ${IPv4_7}
+ ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
+ ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
+ ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
+
+ # configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
+ ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
+ ip -netns ${NS3} link set gre_dev up
+ ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
+ ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6}
+ ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8}
+
+
+ # configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
+ ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
+ ip -netns ${NS3} link set gre6_dev up
+ ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev
+ ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6}
+ ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8}
+
+ # rp_filter gets confused by what these tests are doing, so disable it
+ ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0
+ ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0
+ ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0
+}
+
+cleanup() {
+ ip netns del ${NS1} 2> /dev/null
+ ip netns del ${NS2} 2> /dev/null
+ ip netns del ${NS3} 2> /dev/null
+}
+
+trap cleanup EXIT
+
+test_ping() {
+ local readonly PROTO=$1
+ local readonly EXPECTED=$2
+ local RET=0
+
+ set +e
+ if [ "${PROTO}" == "IPv4" ] ; then
+ ip netns exec ${NS1} ping -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null
+ RET=$?
+ elif [ "${PROTO}" == "IPv6" ] ; then
+ ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null
+ RET=$?
+ else
+ echo "test_ping: unknown PROTO: ${PROTO}"
+ exit 1
+ fi
+ set -e
+
+ if [ "0" != "${RET}" ]; then
+ RET=1
+ fi
+
+ if [ "${EXPECTED}" != "${RET}" ] ; then
+ echo "FAIL: test_ping: ${RET}"
+ exit 1
+ fi
+}
+
+test_egress() {
+ local readonly ENCAP=$1
+ echo "starting egress ${ENCAP} encap test"
+ setup
+
+ # need to wait a bit for IPv6 to autoconf, otherwise
+ # ping6 sometimes fails with "unable to bind to address"
+
+ # by default, pings work
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ # remove NS2->DST routes, ping fails
+ ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3
+ ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ # install replacement routes (LWT/eBPF), pings succeed
+ if [ "${ENCAP}" == "IPv4" ] ; then
+ ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+ ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+ elif [ "${ENCAP}" == "IPv6" ] ; then
+ ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+ ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+ else
+ echo "FAIL: unknown encap ${ENCAP}"
+ fi
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ cleanup
+ echo "PASS"
+}
+
+test_ingress() {
+ local readonly ENCAP=$1
+ echo "starting ingress ${ENCAP} encap test"
+ setup
+
+ # need to wait a bit for IPv6 to autoconf, otherwise
+ # ping6 sometimes fails with "unable to bind to address"
+
+ # by default, pings work
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ # remove NS2->DST routes, pings fail
+ ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3
+ ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ # install replacement routes (LWT/eBPF), pings succeed
+ if [ "${ENCAP}" == "IPv4" ] ; then
+ ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+ ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+ elif [ "${ENCAP}" == "IPv6" ] ; then
+ ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+ ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+ else
+ echo "FAIL: unknown encap ${ENCAP}"
+ fi
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ cleanup
+ echo "PASS"
+}
+
+test_egress IPv4
+test_egress IPv6
+
+test_ingress IPv4
+test_ingress IPv6
+
+echo "all tests passed"