diff options
31 files changed, 1504 insertions, 230 deletions
diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 848735b3a7c0..813a19122ebb 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -11,6 +11,9 @@ struct netdev_xmit { #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) u8 sched_mirred_nest; #endif +#if IS_ENABLED(CONFIG_NF_DUP_NETDEV) + u8 nf_dup_skb_recursion; +#endif }; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2b8aac2c70ad..5f896fcc074d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -95,6 +95,9 @@ enum nf_hook_ops_type { }; struct nf_hook_ops { + struct list_head list; + struct rcu_head rcu; + /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; @@ -470,6 +473,7 @@ struct nf_ct_hook { void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); + u32 (*get_id)(const struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; @@ -498,17 +502,6 @@ extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* - * nf_skb_duplicated - TEE target has sent a packet - * - * When a xtables target sends a packet, the OUTPUT and POSTROUTING - * hooks are traversed again, i.e. nft and xtables are invoked recursively. - * - * This is used by xtables TEE target to prevent the duplicated skb from - * being duplicated again. - */ -DECLARE_PER_CPU(bool, nf_skb_duplicated); - -/* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..52d9c52dc8f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1044,6 +1044,7 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif + unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 803d5f1601f9..e4d8e451e935 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,6 +1142,11 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +struct nft_hook; +void nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event); + enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, @@ -1199,12 +1204,17 @@ struct nft_stats { struct nft_hook { struct list_head list; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 6e202ed5e63f..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include <net/l3mdev.h> #include <net/netfilter/nf_tables.h> struct nft_fib { @@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) return nft_fib_is_loopback(pkt->skb, indev); } +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7d6bc19a0153..518ba144544c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,6 +142,8 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_GETSETELEM_RESET, + NFT_MSG_NEWDEV, + NFT_MSG_DELDEV, NFT_MSG_MAX, }; @@ -1784,10 +1786,18 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) + * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, + NFTA_DEVICE_TABLE, + NFTA_DEVICE_FLOWTABLE, + NFTA_DEVICE_CHAIN, + NFTA_DEVICE_SPEC, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) @@ -1841,6 +1851,10 @@ enum nft_xfrm_keys { * @NFTA_TRACE_MARK: nfmark (NLA_U32) * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + * @NFTA_TRACE_CT_ID: conntrack id (NLA_U32) + * @NFTA_TRACE_CT_DIRECTION: packets direction (NLA_U8) + * @NFTA_TRACE_CT_STATUS: conntrack status (NLA_U32) + * @NFTA_TRACE_CT_STATE: packet state (new, established, ...) (NLA_U32) */ enum nft_trace_attributes { NFTA_TRACE_UNSPEC, @@ -1861,6 +1875,10 @@ enum nft_trace_attributes { NFTA_TRACE_NFPROTO, NFTA_TRACE_POLICY, NFTA_TRACE_PAD, + NFTA_TRACE_CT_ID, + NFTA_TRACE_CT_DIRECTION, + NFTA_TRACE_CT_STATUS, + NFTA_TRACE_CT_STATE, __NFTA_TRACE_MAX }; #define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 6cd58cd2a6f0..50d807af2649 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,6 +25,8 @@ enum nfnetlink_groups { #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA NFNLGRP_NFTRACE, #define NFNLGRP_NFTRACE NFNLGRP_NFTRACE + NFNLGRP_NFT_DEV, +#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27f..23c8deff8095 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -270,7 +270,7 @@ ipt_do_table(void *priv, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 25e1e8eb18dd..ed08fb78cfa8 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, struct iphdr *iph; local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->ttl; if (nf_dup_ipv4_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 9082ca17e845..7e7c49535e3f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, else addr = iph->saddr; - *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) { + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + return; + } + + *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr); } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); @@ -65,8 +70,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_proto = pkt->tprot, .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; const struct net_device *oif; const struct net_device *found; @@ -90,6 +95,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; + fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae7..d585ac3c1113 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 0c39c77fe8a8..b903c62c00c9 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 7fd9d7b21cd4..421036a3605b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); return lookup_flags; } @@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); + const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; struct ipv6hdr *iph, _iph; @@ -165,7 +165,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; struct rt6_info *rt; int lookup_flags; @@ -203,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev && - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) - goto put_rt_err; + if (!oif) { + found = rt->rt6i_idev->dev; + } else { + if (oif == rt->rt6i_idev->dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + found = oif; + } - nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); + nft_fib_store_result(dest, priv, found); put_rt_err: ip6_rt_put(rt); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b9f551f02c81..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index de8d50af9b5b..201d3c4ec623 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -505,6 +505,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { @@ -2710,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return ¤t->net_xmit.nf_dup_skb_recursion; +} + +#endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b28f6730e26d..24c71ecb2179 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -300,40 +300,75 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ + struct nf_hook_ops *ops, *next; + + list_for_each_entry_safe(ops, next, &hook->ops_list, list) { + list_del(&ops->list); + kfree(ops); + } +} + +static void nft_netdev_hook_free(struct nft_hook *hook) +{ + nft_netdev_hook_free_ops(hook); + kfree(hook); +} + +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ + struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + + nft_netdev_hook_free(hook); +} + +static void nft_netdev_hook_free_rcu(struct nft_hook *hook) +{ + call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -2253,7 +2288,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain) list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } module_put(basechain->type->owner); @@ -2274,19 +2309,20 @@ void nf_tables_chain_destroy(struct nft_chain *chain) static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; int err; hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); - if (!hook) { - err = -ENOMEM; - goto err_hook_alloc; - } + if (!hook) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&hook->ops_list); err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); if (err < 0) - goto err_hook_dev; + goto err_hook_free; hook->ifnamelen = nla_len(attr); @@ -2294,18 +2330,22 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. */ - dev = __dev_get_by_name(net, hook->ifname); - if (!dev) { - err = -ENOENT; - goto err_hook_dev; - } - hook->ops.dev = dev; + for_each_netdev(net, dev) { + if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) + continue; + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_free; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); + } return hook; -err_hook_dev: - kfree(hook); -err_hook_alloc: +err_hook_free: + nft_netdev_hook_free(hook); return ERR_PTR(err); } @@ -2315,7 +2355,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (!strcmp(hook->ifname, this->ifname)) + if (!strncmp(hook->ifname, this->ifname, + min(hook->ifnamelen, this->ifnamelen))) return hook; } @@ -2345,7 +2386,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, } if (nft_hook_list_find(hook_list, hook)) { NL_SET_BAD_ATTR(extack, tmp); - kfree(hook); + nft_netdev_hook_free(hook); err = -EEXIST; goto err_hook; } @@ -2363,7 +2404,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } return err; } @@ -2506,7 +2547,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } module_put(hook->type->owner); } @@ -2559,6 +2600,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; @@ -2567,8 +2609,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } nft_basechain_hook_init(&basechain->ops, family, hook, chain); @@ -2787,15 +2831,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { - h->ops.pf = basechain->ops.pf; - h->ops.hooknum = basechain->ops.hooknum; - h->ops.priority = basechain->ops.priority; - h->ops.priv = basechain->ops.priv; - h->ops.hook = basechain->ops.hook; + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } } } else { @@ -2913,10 +2959,12 @@ err_trans: err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { - if (unregister) - nf_unregister_net_hook(ctx->net, &h->ops); + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } list_del(&h->list); - kfree_rcu(h, rcu); + nft_netdev_hook_free_rcu(h); } module_put(hook.type->owner); } @@ -8785,6 +8833,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; struct nft_hook *hook; int hooknum, priority; int err; @@ -8839,11 +8888,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, } list_for_each_entry(hook, &flowtable_hook->list, list) { - hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = flowtable_hook->num; - hook->ops.priority = flowtable_hook->priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + list_for_each_entry(ops, &hook->ops_list, list) { + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + } } return err; @@ -8885,12 +8936,12 @@ nft_flowtable_type_get(struct net *net, u8 family) } /* Only called from error and netdev event paths. */ -static void nft_unregister_flowtable_hook(struct net *net, - struct nft_flowtable *flowtable, - struct nft_hook *hook) +static void nft_unregister_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + nf_unregister_net_hook(net, ops); + flowtable->data.type->setup(&flowtable->data, ops->dev, FLOW_BLOCK_UNBIND); } @@ -8900,14 +8951,14 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -8919,6 +8970,26 @@ static void nft_unregister_flowtable_net_hooks(struct net *net, __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } +static int nft_register_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) +{ + int err; + + err = flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_BIND); + if (err < 0) + return err; + + err = nf_register_net_hook(net, ops); + if (!err) + return 0; + + flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_UNBIND); + return err; +} + static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, struct list_head *hook_list, @@ -8926,6 +8997,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, { struct nft_hook *hook, *next; struct nft_flowtable *ft; + struct nf_hook_ops *ops; int err, i = 0; list_for_each_entry(hook, hook_list, list) { @@ -8939,33 +9011,27 @@ static int nft_register_flowtable_net_hooks(struct net *net, } } - err = flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_BIND); - if (err < 0) - goto err_unregister_net_hooks; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nft_register_flowtable_ops(net, flowtable, ops); + if (err < 0) + goto err_unregister_net_hooks; - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) { - flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_UNBIND); - goto err_unregister_net_hooks; + i++; } - - i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_hook(net, flowtable, hook); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -8977,7 +9043,7 @@ static void nft_hooks_destroy(struct list_head *hook_list) list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } @@ -8988,6 +9054,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; u32 flags; @@ -9001,7 +9068,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } } @@ -9045,10 +9112,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -9194,7 +9264,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); - kfree(this); + nft_netdev_hook_free(this); } } @@ -9557,7 +9627,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } kfree(flowtable->name); module_put(flowtable->data.type->owner); @@ -9590,46 +9660,190 @@ nla_put_failure: return -EMSGSIZE; } -static void nft_flowtable_event(unsigned long event, struct net_device *dev, - struct nft_flowtable *flowtable) +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev) { + struct nf_hook_ops *ops; + + list_for_each_entry_rcu(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + +static void +nf_tables_device_notify(const struct nft_table *table, int attr, + const char *name, const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + struct net *net = dev_net(dev); + struct nlmsghdr *nlh; + struct sk_buff *skb; + u16 flags = 0; + + if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + goto err; + + event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); + nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) + goto err; + + if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || + nla_put_string(skb, attr, name) || + nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || + nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + goto err; + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, + nlmsg_report(nlh), GFP_KERNEL); + return; +err: + if (skb) + kfree_skb(skb); + nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); +} + +void +nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, + chain->name, hook, dev, event); +} + +static void +nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, + ft->name, hook, dev, event); +} + +static int nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable, bool changename) +{ + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &flowtable->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - /* flow_offload_netdev_event() cleans up entries for us. */ - nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + /* flow_offload_netdev_event() cleans up entries for us. */ + nft_unregister_flowtable_ops(dev_net(dev), + flowtable, ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kzalloc(sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable->hooknum; + ops->priority = flowtable->data.priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->dev = dev; + if (nft_register_flowtable_ops(dev_net(dev), + flowtable, ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + nf_tables_flowtable_device_notify(flowtable, hook, dev, event); break; } + return 0; +} + +static int __nf_tables_flowtable_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nftables_pernet *nft_net = nft_pernet(dev_net(dev)); + struct nft_flowtable *flowtable; + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + if (nft_flowtable_event(event, dev, + flowtable, changename)) + return 1; + } + } + return 0; } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; - struct nft_table *table; + int ret = NOTIFY_DONE; struct net *net; - if (event != NETDEV_UNREGISTER) - return 0; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; net = dev_net(dev); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); - list_for_each_entry(table, &nft_net->tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - nft_flowtable_event(event, dev, flowtable); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_flowtable_event(event, dev, false)) { + ret = NOTIFY_BAD; } +out_unlock: mutex_unlock(&nft_net->commit_mutex); - - return NOTIFY_DONE; + return ret; } static struct notifier_block nf_tables_flowtable_notifier = { diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64675f1c7f29..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain) bool nft_chain_offload_support(const struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; @@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain) return false; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.pf != NFPROTO_NETDEV || - hook->ops.hooknum != NF_NETDEV_INGRESS) - return false; - - dev = hook->ops.dev; - if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) - return false; + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } } return true; @@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } @@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 580c55268f65..ae3fe87195ab 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -291,6 +339,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..ac77fc21632d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, + [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 19a553550c76..846d48ba8965 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,38 +318,68 @@ static const struct nft_chain_type nft_chain_filter_netdev = { }, }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_base_chain *basechain) +static int nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_base_chain *basechain, bool changename) { + struct nft_table *table = basechain->chain.table; + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); + + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(dev_net(dev), ops); - if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT)) - nf_unregister_net_hook(dev_net(dev), &hook->ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + ops = kmemdup(&basechain->ops, + sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->dev = dev; + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nf_register_net_hook(dev_net(dev), ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + nf_tables_chain_device_notify(&basechain->chain, + hook, dev, event); break; } + return 0; } -static int nf_tables_netdev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int __nf_tables_netdev_event(unsigned long event, + struct net_device *dev, + bool changename) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_base_chain *basechain; struct nftables_pernet *nft_net; struct nft_chain *chain; struct nft_table *table; - if (event != NETDEV_UNREGISTER) - return NOTIFY_DONE; - nft_net = nft_pernet(dev_net(dev)); - mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) @@ -364,12 +394,40 @@ static int nf_tables_netdev_event(struct notifier_block *this, basechain->ops.hooknum != NF_INET_INGRESS) continue; - nft_netdev_event(event, dev, basechain); + if (nft_netdev_event(event, dev, basechain, changename)) + return 1; } } - mutex_unlock(&nft_net->commit_mutex); + return 0; +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; + int ret = NOTIFY_DONE; - return NOTIFY_DONE; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; + + nft_net = nft_pernet(dev_net(dev)); + mutex_lock(&nft_net->commit_mutex); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; + } + __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_netdev_event(event, dev, false)) { + ret = NOTIFY_BAD; + } +out_unlock: + mutex_unlock(&nft_net->commit_mutex); + return ret; } static struct notifier_block nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 221d50223018..225ff293cd50 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev, bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 817ab978d24a..c4569d4b9228 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -23,7 +23,14 @@ #include <linux/ip.h> #include <linux/ipv6.h> -static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); +struct nft_inner_tun_ctx_locked { + struct nft_inner_tun_ctx ctx; + local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Same layout as nft_expr but it embeds the private expression data area. */ struct __nft_expr { @@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); - this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { local_bh_enable(); + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); return false; } *tun_ctx = *this_cpu_tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); local_bh_enable(); return true; @@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); - this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) *this_cpu_tun_ctx = *tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); local_bh_enable(); } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 0c63d1367cf7..a12486ae089d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -621,10 +621,10 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || @@ -634,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b..93f064306901 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_tcpoptstrip_target_info), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TCPOPTSTRIP", .family = NFPROTO_IPV6, diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 65b965ca40ea..59b9d04400ca 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -48,7 +48,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP) { .name = "MARK", .revision = 2, diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 3bdcbbdba925..e9b2f553588d 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -24,6 +24,7 @@ TEST_PROGS += nft_concat_range.sh TEST_PROGS += nft_conntrack_helper.sh TEST_PROGS += nft_fib.sh TEST_PROGS += nft_flowtable.sh +TEST_PROGS += nft_interface_stress.sh TEST_PROGS += nft_meta.sh TEST_PROGS += nft_nat.sh TEST_PROGS += nft_nat_zones.sh diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh index 025b58f2ae91..207b79932d91 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -32,7 +32,6 @@ source lib.sh IP0=172.30.30.1 IP1=172.30.30.2 -DUMMYNET=10.9.9 PFXL=30 ret=0 @@ -52,8 +51,6 @@ trap cleanup EXIT setup_ns ns0 ns1 -ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.forwarding=1 - if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: Could not add veth device" exit $ksft_skip @@ -64,18 +61,13 @@ if ! ip -net "$ns0" li add tvrf type vrf table 9876; then exit $ksft_skip fi -ip -net "$ns0" link add dummy0 type dummy - ip -net "$ns0" li set veth0 master tvrf -ip -net "$ns0" li set dummy0 master tvrf ip -net "$ns0" li set tvrf up ip -net "$ns0" li set veth0 up -ip -net "$ns0" li set dummy0 up ip -net "$ns1" li set veth0 up ip -net "$ns0" addr add $IP0/$PFXL dev veth0 ip -net "$ns1" addr add $IP1/$PFXL dev veth0 -ip -net "$ns0" addr add $DUMMYNET.1/$PFXL dev dummy0 listener_ready() { @@ -216,35 +208,9 @@ EOF fi } -test_fib() -{ -ip netns exec "$ns0" nft -f - <<EOF -flush ruleset -table ip t { - counter fibcount { } - - chain prerouting { - type filter hook prerouting priority 0; - meta iifname veth0 ip daddr $DUMMYNET.2 fib daddr oif dummy0 counter name fibcount notrack - } -} -EOF - ip -net "$ns1" route add 10.9.9.0/24 via "$IP0" dev veth0 - ip netns exec "$ns1" ping -q -w 1 -c 1 "$DUMMYNET".2 > /dev/null - - if ip netns exec "$ns0" nft list counter t fibcount | grep -q "packets 1"; then - echo "PASS: fib lookup returned exepected output interface" - else - echo "FAIL: fib lookup did not return exepected output interface" - ret=1 - return - fi -} - test_ct_zone_in test_masquerade_vrf "default" test_masquerade_vrf "pfifo" test_masquerade_veth -test_fib exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 1f5979c1510c..efea93cf23d4 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -15,10 +15,12 @@ source lib.sh # Available test groups: # - reported_issues: check for issues that were reported in the past # - correctness: check that packets match given entries, and only those +# - correctness_large: same but with additional non-matching entries # - concurrency: attempt races between insertion, deletion and lookup # - timeout: check that packets match entries until they expire # - performance: estimate matching rate, compare with rbtree and hash baselines -TESTS="reported_issues correctness concurrency timeout" +TESTS="reported_issues correctness correctness_large concurrency timeout" + [ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}" # Set types, defined by TYPE_ variables below @@ -1257,9 +1259,7 @@ send_nomatch() { # - add ranged element, check that packets match it # - check that packets outside range don't match it # - remove some elements, check that packets don't match anymore -test_correctness() { - setup veth send_"${proto}" set || return ${ksft_skip} - +test_correctness_main() { range_size=1 for i in $(seq "${start}" $((start + count))); do end=$((start + range_size)) @@ -1293,6 +1293,163 @@ test_correctness() { done } +test_correctness() { + setup veth send_"${proto}" set || return ${ksft_skip} + + test_correctness_main +} + +# Repeat the correctness tests, but add extra non-matching entries. +# This exercises the more compact '4 bit group' representation that +# gets picked when the default 8-bit representation exceed +# NFT_PIPAPO_LT_SIZE_HIGH bytes of memory. +# See usage of NFT_PIPAPO_LT_SIZE_HIGH in pipapo_lt_bits_adjust(). +# +# The format() helper is way too slow when generating lots of +# entries so its not used here. +test_correctness_large() { + setup veth send_"${proto}" set || return ${ksft_skip} + # number of dummy (filler) entries to add. + local dcount=16385 + + ( + echo -n "add element inet filter test { " + + case "$type_spec" in + "ether_addr . ipv4_addr") + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + format_mac $((1000000 + i)) + printf ". 172.%i.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) + done + ;; + "inet_proto . ipv6_addr") + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "%i . " $((RANDOM%256)) + format_addr6 $((1000000 + i)) + done + ;; + "inet_service . inet_proto") + # smaller key sizes, need more entries to hit the + # 4-bit threshold. + dcount=65536 + for i in $(seq 1 $dcount); do + local proto=$((RANDOM%256)) + + # Test uses UDP to match, as it also fails when matching + # an entry that doesn't exist, so skip 'udp' entries + # to not trigger a wrong failure. + [ $proto -eq 17 ] && proto=18 + [ $i -gt 1 ] && echo ", " + printf "%i . %i " $(((i%65534) + 1)) $((proto)) + done + ;; + "inet_service . ipv4_addr") + dcount=32768 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "%i . 172.%i.%i.%i " $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((i%256)) + done + ;; + "ipv4_addr . ether_addr") + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "172.%i.%i.%i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) + format_mac $((1000000 + i)) + done + ;; + "ipv4_addr . inet_service") + dcount=32768 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "172.%i.%i.%i . %i" $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) + done + ;; + "ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr") + dcount=65536 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "172.%i.%i.%i . %i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) + format_mac $((1000000 + i)) + printf ". %i . 192.168.%i.%i" $((RANDOM%256)) $((RANDOM%256)) $((i%256)) + done + ;; + "ipv4_addr . inet_service . inet_proto") + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "172.%i.%i.%i . %i . %i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) + done + ;; + "ipv4_addr . inet_service . inet_proto . ipv4_addr") + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "172.%i.%i.%i . %i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256)) + done + ;; + "ipv4_addr . inet_service . ipv4_addr") + dcount=32768 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + printf "172.%i.%i.%i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) + done + ;; + "ipv6_addr . ether_addr") + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + format_addr6 $((i + 1000000)) + echo -n " . " + format_mac $((1000000 + i)) + done + ;; + "ipv6_addr . inet_service") + dcount=32768 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + format_addr6 $((i + 1000000)) + echo -n " . $(((RANDOM%65534) + 1))" + done + ;; + "ipv6_addr . inet_service . ether_addr") + dcount=32768 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + format_addr6 $((i + 1000000)) + echo -n " . $(((RANDOM%65534) + 1)) . " + format_mac $((i + 1000000)) + done + ;; + "ipv6_addr . inet_service . ether_addr . inet_proto") + dcount=65536 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + format_addr6 $((i + 1000000)) + echo -n " . $(((RANDOM%65534) + 1)) . " + format_mac $((i + 1000000)) + echo -n " . $((RANDOM%256))" + done + ;; + "ipv6_addr . inet_service . ipv6_addr . inet_service") + dcount=32768 + for i in $(seq 1 $dcount); do + [ $i -gt 1 ] && echo ", " + format_addr6 $((i + 1000000)) + echo -n " . $(((RANDOM%65534) + 1)) . " + format_addr6 $((i + 2123456)) + echo -n " . $((RANDOM%256))" + done + ;; + *) + "Unhandled $type_spec" + return 1 + esac + echo -n "}" + + ) | nft -f - || return 1 + + test_correctness_main +} + # Concurrency test template: # - add all the elements # - start a thread for each physical thread that: diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh index 82780b39277c..9929a9ffef65 100755 --- a/tools/testing/selftests/net/netfilter/nft_fib.sh +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -3,6 +3,10 @@ # This tests the fib expression. # # Kselftest framework requirement - SKIP code is 4. +# +# 10.0.1.99 10.0.1.1 10.0.2.1 10.0.2.99 +# dead:1::99 dead:1::1 dead:2::1 dead:2::99 +# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2 source lib.sh @@ -72,6 +76,89 @@ table inet filter { EOF } +load_type_ruleset() { + local netns=$1 + + for family in ip ip6;do +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table $family filter { + chain type_match_in { + fib daddr type local counter comment "daddr configured on other iface" + fib daddr . iif type local counter comment "daddr configured on iif" + fib daddr type unicast counter comment "daddr not local" + fib daddr . iif type unicast counter comment "daddr not configured on iif" + } + + chain type_match_out { + fib daddr type unicast counter + fib daddr . oif type unicast counter + fib daddr type local counter + fib daddr . oif type local counter + } + + chain prerouting { + type filter hook prerouting priority 0; + icmp type echo-request counter jump type_match_in + icmpv6 type echo-request counter jump type_match_in + } + + chain input { + type filter hook input priority 0; + icmp type echo-request counter jump type_match_in + icmpv6 type echo-request counter jump type_match_in + } + + chain forward { + type filter hook forward priority 0; + icmp type echo-request counter jump type_match_in + icmpv6 type echo-request counter jump type_match_in + } + + chain output { + type filter hook output priority 0; + icmp type echo-request counter jump type_match_out + icmpv6 type echo-request counter jump type_match_out + } + + chain postrouting { + type filter hook postrouting priority 0; + icmp type echo-request counter jump type_match_out + icmpv6 type echo-request counter jump type_match_out + } +} +EOF +done +} + +reload_type_ruleset() { + ip netns exec "$1" nft flush table ip filter + ip netns exec "$1" nft flush table ip6 filter + load_type_ruleset "$1" +} + +check_fib_type_counter_family() { + local family="$1" + local want="$2" + local ns="$3" + local chain="$4" + local what="$5" + local errmsg="$6" + + if ! ip netns exec "$ns" nft list chain "$family" filter "$chain" | grep "$what" | grep -q "packets $want";then + echo "Netns $ns $family fib type counter doesn't match expected packet count of $want for $what $errmsg" 1>&2 + ip netns exec "$ns" nft list chain "$family" filter "$chain" + ret=1 + return 1 + fi + + return 0 +} + +check_fib_type_counter() { + check_fib_type_counter_family "ip" "$@" || return 1 + check_fib_type_counter_family "ip6" "$@" || return 1 +} + load_ruleset_count() { local netns=$1 @@ -90,6 +177,7 @@ check_drops() { if dmesg | grep -q ' nft_rpfilter: ';then dmesg | grep ' nft_rpfilter: ' echo "FAIL: rpfilter did drop packets" + ret=1 return 1 fi @@ -164,17 +252,496 @@ test_ping() { return 0 } +test_ping_unreachable() { + local daddr4=$1 + local daddr6=$2 + + if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr4" > /dev/null; then + echo "FAIL: ${ns1} could reach $daddr4" 1>&2 + return 1 + fi + + if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr6" > /dev/null; then + echo "FAIL: ${ns1} could reach $daddr6" 1>&2 + return 1 + fi + + return 0 +} + +test_fib_type() { + local notice="$1" + local errmsg="addr-on-if" + local lret=0 + + if ! load_type_ruleset "$nsrouter";then + echo "SKIP: Could not load fib type ruleset" + [ $ret -eq 0 ] && ret=$ksft_skip + return + fi + + # makes router receive packet for addresses configured on incoming + # interface. + test_ping 10.0.1.1 dead:1::1 || return 1 + + # expectation: triggers all 'local' in prerouting/input. + check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1 + check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type local" "$errmsg" || lret=1 + + reload_type_ruleset "$nsrouter" + # makes router receive packet for address configured on a different (but local) + # interface. + test_ping 10.0.2.1 dead:2::1 || return 1 + + # expectation: triggers 'unicast' in prerouting/input for daddr . iif and local for 'daddr'. + errmsg="addr-on-host" + check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1 + check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1 + + reload_type_ruleset "$nsrouter" + test_ping 10.0.2.99 dead:2::99 || return 1 + errmsg="addr-on-otherhost" + check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type unicast" "$errmsg" || lret=1 + check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1 + + if [ $lret -eq 0 ];then + echo "PASS: fib expression address types match ($notice)" + else + echo "FAIL: fib expression address types match ($notice)" + ret=1 + fi +} + +test_fib_vrf_dev_add_dummy() +{ + if ! ip -net "$nsrouter" link add dummy0 type dummy ;then + echo "SKIP: VRF tests: dummy device type not supported" + return 1 + fi + + if ! ip -net "$nsrouter" link add tvrf type vrf table 9876;then + echo "SKIP: VRF tests: vrf device type not supported" + return 1 + fi + + ip -net "$nsrouter" link set dummy0 master tvrf + ip -net "$nsrouter" link set dummy0 up + ip -net "$nsrouter" link set tvrf up +} + +load_ruleset_vrf() +{ +# Due to the many different possible combinations using named counters +# or one-rule-per-expected-result is complex. +# +# Instead, add dynamic sets for the fib modes +# (fib address type, fib output interface lookup .. ), +# and then add the obtained fib results to them. +# +# The test is successful if the sets contain the expected results +# and no unexpected extra entries existed. +ip netns exec "$nsrouter" nft -f - <<EOF +flush ruleset +table inet t { + set fibif4 { + typeof meta iif . ip daddr . fib daddr oif + flags dynamic + counter + } + + set fibif4iif { + typeof meta iif . ip daddr . fib daddr . iif oif + flags dynamic + counter + } + + set fibif6 { + typeof meta iif . ip6 daddr . fib daddr oif + flags dynamic + counter + } + + set fibif6iif { + typeof meta iif . ip6 daddr . fib daddr . iif oif + flags dynamic + counter + } + + set fibtype4 { + typeof meta iif . ip daddr . fib daddr type + flags dynamic + counter + } + + set fibtype4iif { + typeof meta iif . ip daddr . fib daddr . iif type + flags dynamic + counter + } + + set fibtype6 { + typeof meta iif . ip6 daddr . fib daddr type + flags dynamic + counter + } + + set fibtype6iif { + typeof meta iif . ip6 daddr . fib daddr . iif type + flags dynamic + counter + } + + chain fib_test { + meta nfproto ipv4 jump { + add @fibif4 { meta iif . ip daddr . fib daddr oif } + add @fibif4iif { meta iif . ip daddr . fib daddr . iif oif } + add @fibtype4 { meta iif . ip daddr . fib daddr type } + add @fibtype4iif { meta iif . ip daddr . fib daddr . iif type } + + add @fibif4 { meta iif . ip saddr . fib saddr oif } + add @fibif4iif { meta iif . ip saddr . fib saddr . iif oif } + } + + meta nfproto ipv6 jump { + add @fibif6 { meta iif . ip6 daddr . fib daddr oif } + add @fibif6iif { meta iif . ip6 daddr . fib daddr . iif oif } + add @fibtype6 { meta iif . ip6 daddr . fib daddr type } + add @fibtype6iif { meta iif . ip6 daddr . fib daddr . iif type } + + add @fibif6 { meta iif . ip6 saddr . fib saddr oif } + add @fibif6iif { meta iif . ip6 saddr . fib saddr . iif oif } + } + } + + chain prerouting { + type filter hook prerouting priority 0; + icmp type echo-request counter jump fib_test + + # neighbour discovery to be ignored. + icmpv6 type echo-request counter jump fib_test + } +} +EOF + +if [ $? -ne 0 ] ;then + echo "SKIP: Could not load ruleset for fib vrf test" + [ $ret -eq 0 ] && ret=$ksft_skip + return 1 +fi +} + +check_type() +{ + local setname="$1" + local iifname="$2" + local addr="$3" + local type="$4" + local count="$5" + + [ -z "$count" ] && count=1 + + if ! ip netns exec "$nsrouter" nft get element inet t "$setname" { "$iifname" . "$addr" . "$type" } |grep -q "counter packets $count";then + echo "FAIL: did not find $iifname . $addr . $type in $setname" + ip netns exec "$nsrouter" nft list set inet t "$setname" + ret=1 + return 1 + fi + + # delete the entry, this allows to check if anything unexpected appeared + # at the end of the test run: all dynamic sets should be empty by then. + if ! ip netns exec "$nsrouter" nft delete element inet t "$setname" { "$iifname" . "$addr" . "$type" } ; then + echo "FAIL: can't delete $iifname . $addr . $type in $setname" + ip netns exec "$nsrouter" nft list set inet t "$setname" + ret=1 + return 1 + fi + + return 0 +} + +check_local() +{ + check_type $@ "local" 1 +} + +check_unicast() +{ + check_type $@ "unicast" 1 +} + +check_rpf() +{ + check_type $@ +} + +check_fib_vrf_sets_empty() +{ + local setname="" + local lret=0 + + # A non-empty set means that we have seen unexpected packets OR + # that a fib lookup provided unexpected results. + for setname in "fibif4" "fibif4iif" "fibif6" "fibif6iif" \ + "fibtype4" "fibtype4iif" "fibtype6" "fibtype6iif";do + if ip netns exec "$nsrouter" nft list set inet t "$setname" | grep -q elements;then + echo "FAIL: $setname not empty" + ip netns exec "$nsrouter" nft list set inet t "$setname" + ret=1 + lret=1 + fi + done + + return $lret +} + +check_fib_vrf_type() +{ + local msg="$1" + + local addr + # the incoming interface is always veth0. As its not linked to a VRF, + # the 'tvrf' device should NOT show up anywhere. + local ifname="veth0" + local lret=0 + + # local_veth0, local_veth1 + for addr in "10.0.1.1" "10.0.2.1"; do + check_local fibtype4 "$ifname" "$addr" || lret=1 + check_type fibif4 "$ifname" "$addr" "0" || lret=1 + done + for addr in "dead:1::1" "dead:2::1";do + check_local fibtype6 "$ifname" "$addr" || lret=1 + check_type fibif6 "$ifname" "$addr" "0" || lret=1 + done + + # when restricted to the incoming interface, 10.0.1.1 should + # be 'local', but 10.0.2.1 unicast. + check_local fibtype4iif "$ifname" "10.0.1.1" || lret=1 + check_unicast fibtype4iif "$ifname" "10.0.2.1" || lret=1 + + # same for the ipv6 addresses. + check_local fibtype6iif "$ifname" "dead:1::1" || lret=1 + check_unicast fibtype6iif "$ifname" "dead:2::1" || lret=1 + + # None of these addresses should find a valid route when restricting + # to the incoming interface (we ask for daddr - 10.0.1.1/2.1 are + # reachable via 'lo'. + for addr in "10.0.1.1" "10.0.2.1" "10.9.9.1" "10.9.9.2";do + check_type fibif4iif "$ifname" "$addr" "0" || lret=1 + done + + # expect default route (veth1), dummy0 is part of VRF but iif isn't. + for addr in "10.9.9.1" "10.9.9.2";do + check_unicast fibtype4 "$ifname" "$addr" || lret=1 + check_unicast fibtype4iif "$ifname" "$addr" || lret=1 + check_type fibif4 "$ifname" "$addr" "veth1" || lret=1 + done + for addr in "dead:9::1" "dead:9::2";do + check_unicast fibtype6 "$ifname" "$addr" || lret=1 + check_unicast fibtype6iif "$ifname" "$addr" || lret=1 + check_type fibif6 "$ifname" "$addr" "veth1" || lret=1 + done + + # same for the IPv6 equivalent addresses. + for addr in "dead:1::1" "dead:2::1" "dead:9::1" "dead:9::2";do + check_type fibif6iif "$ifname" "$addr" "0" || lret=1 + done + + check_unicast fibtype4 "$ifname" "10.0.2.99" || lret=1 + check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1 + check_unicast fibtype6 "$ifname" "dead:2::99" || lret=1 + check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1 + + check_type fibif4 "$ifname" "10.0.2.99" "veth1" || lret=1 + check_type fibif4iif "$ifname" "10.0.2.99" 0 || lret=1 + check_type fibif6 "$ifname" "dead:2::99" "veth1" || lret=1 + check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1 + + check_rpf fibif4 "$ifname" "10.0.1.99" "veth0" 5 || lret=1 + check_rpf fibif4iif "$ifname" "10.0.1.99" "veth0" 5 || lret=1 + check_rpf fibif6 "$ifname" "dead:1::99" "veth0" 5 || lret=1 + check_rpf fibif6iif "$ifname" "dead:1::99" "veth0" 5 || lret=1 + + check_fib_vrf_sets_empty || lret=1 + + if [ $lret -eq 0 ];then + echo "PASS: $msg" + else + echo "FAIL: $msg" + ret=1 + fi +} + +check_fib_veth_vrf_type() +{ + local msg="$1" + + local addr + local ifname + local setname + local lret=0 + + # as veth0 is now part of tvrf interface, packets will be seen + # twice, once with iif veth0, then with iif tvrf. + + for ifname in "veth0" "tvrf"; do + for addr in "10.0.1.1" "10.9.9.1"; do + check_local fibtype4 "$ifname" "$addr" || lret=1 + # addr local, but nft_fib doesn't return routes with RTN_LOCAL. + check_type fibif4 "$ifname" "$addr" 0 || lret=1 + check_type fibif4iif "$ifname" "$addr" 0 || lret=1 + done + + for addr in "dead:1::1" "dead:9::1"; do + check_local fibtype6 "$ifname" "$addr" || lret=1 + # same, address is local but no route is returned for lo. + check_type fibif6 "$ifname" "$addr" 0 || lret=1 + check_type fibif6iif "$ifname" "$addr" 0 || lret=1 + done + + for t in fibtype4 fibtype4iif; do + check_unicast "$t" "$ifname" 10.9.9.2 || lret=1 + done + for t in fibtype6 fibtype6iif; do + check_unicast "$t" "$ifname" dead:9::2 || lret=1 + done + + check_unicast fibtype4iif "$ifname" "10.9.9.1" || lret=1 + check_unicast fibtype6iif "$ifname" "dead:9::1" || lret=1 + + check_unicast fibtype4 "$ifname" "10.0.2.99" || lret=1 + check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1 + + check_unicast fibtype6 "$ifname" "dead:2::99" || lret=1 + check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1 + + check_type fibif4 "$ifname" "10.0.2.99" "veth1" || lret=1 + check_type fibif6 "$ifname" "dead:2::99" "veth1" || lret=1 + check_type fibif4 "$ifname" "10.9.9.2" "dummy0" || lret=1 + check_type fibif6 "$ifname" "dead:9::2" "dummy0" || lret=1 + + # restricted to iif -- MUST NOT provide result, its != $ifname. + check_type fibif4iif "$ifname" "10.0.2.99" 0 || lret=1 + check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1 + + check_rpf fibif4 "$ifname" "10.0.1.99" "veth0" 4 || lret=1 + check_rpf fibif6 "$ifname" "dead:1::99" "veth0" 4 || lret=1 + check_rpf fibif4iif "$ifname" "10.0.1.99" "$ifname" 4 || lret=1 + check_rpf fibif6iif "$ifname" "dead:1::99" "$ifname" 4 || lret=1 + done + + check_local fibtype4iif "veth0" "10.0.1.1" || lret=1 + check_local fibtype6iif "veth0" "dead:1::1" || lret=1 + + check_unicast fibtype4iif "tvrf" "10.0.1.1" || lret=1 + check_unicast fibtype6iif "tvrf" "dead:1::1" || lret=1 + + # 10.9.9.2 should not provide a result for iif veth, but + # should when iif is tvrf. + # This is because its reachable via dummy0 which is part of + # tvrf. iif veth0 MUST conceal the dummy0 result (i.e. return oif 0). + check_type fibif4iif "veth0" "10.9.9.2" 0 || lret=1 + check_type fibif6iif "veth0" "dead:9::2" 0 || lret=1 + + check_type fibif4iif "tvrf" "10.9.9.2" "tvrf" || lret=1 + check_type fibif6iif "tvrf" "dead:9::2" "tvrf" || lret=1 + + check_fib_vrf_sets_empty || lret=1 + + if [ $lret -eq 0 ];then + echo "PASS: $msg" + else + echo "FAIL: $msg" + ret=1 + fi +} + +# Extends nsrouter config by adding dummy0+vrf. +# +# 10.0.1.99 10.0.1.1 10.0.2.1 10.0.2.99 +# dead:1::99 dead:1::1 dead:2::1 dead:2::99 +# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2 +# [dummy0] +# 10.9.9.1 +# dead:9::1 +# [tvrf] +test_fib_vrf() +{ + local cntname="" + + if ! test_fib_vrf_dev_add_dummy; then + [ $ret -eq 0 ] && ret=$ksft_skip + return + fi + + ip -net "$nsrouter" addr add "10.9.9.1"/24 dev dummy0 + ip -net "$nsrouter" addr add "dead:9::1"/64 dev dummy0 nodad + + ip -net "$nsrouter" route add default via 10.0.2.99 + ip -net "$nsrouter" route add default via dead:2::99 + + load_ruleset_vrf || return + + # no echo reply for these addresses: The dummy interface is part of tvrf, + # but veth0 (incoming interface) isn't linked to it. + test_ping_unreachable "10.9.9.1" "dead:9::1" & + test_ping_unreachable "10.9.9.2" "dead:9::2" & + + # expect replies from these. + test_ping "10.0.1.1" "dead:1::1" + test_ping "10.0.2.1" "dead:2::1" + test_ping "10.0.2.99" "dead:2::99" + + wait + + check_fib_vrf_type "fib expression address types match (iif not in vrf)" + + # second round: this time, make veth0 (rx interface) part of the vrf. + # 10.9.9.1 / dead:9::1 become reachable from ns1, while ns2 + # becomes unreachable. + ip -net "$nsrouter" link set veth0 master tvrf + ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + + # this reload should not be needed, but in case + # there is some error (missing or unexpected entry) this will prevent them + # from leaking into round 2. + load_ruleset_vrf || return + + test_ping "10.0.1.1" "dead:1::1" + test_ping "10.9.9.1" "dead:9::1" + + # ns2 should no longer be reachable (veth1 not in vrf) + test_ping_unreachable "10.0.2.99" "dead:2::99" & + + # vrf via dummy0, but host doesn't exist + test_ping_unreachable "10.9.9.2" "dead:9::2" & + + wait + + check_fib_veth_vrf_type "fib expression address types match (iif in vrf)" +} + ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null test_ping 10.0.2.1 dead:2::1 || exit 1 -check_drops || exit 1 +check_drops test_ping 10.0.2.99 dead:2::99 || exit 1 -check_drops || exit 1 +check_drops + +[ $ret -eq 0 ] && echo "PASS: fib expression did not cause unwanted packet drops" + +load_input_ruleset "$ns1" + +test_ping 127.0.0.1 ::1 +check_drops + +test_ping 10.0.1.99 dead:1::99 +check_drops -echo "PASS: fib expression did not cause unwanted packet drops" +[ $ret -eq 0 ] && echo "PASS: fib expression did not discard loopback packets" load_input_ruleset "$ns1" @@ -234,7 +801,7 @@ ip -net "$nsrouter" addr del dead:2::1/64 dev veth0 # ... pbr ruleset for the router, check iif+oif. if ! load_pbr_ruleset "$nsrouter";then echo "SKIP: Could not load fib forward ruleset" - exit $ksft_skip + [ "$ret" -eq 0 ] && ret=$ksft_skip fi ip -net "$nsrouter" rule add from all table 128 @@ -245,11 +812,36 @@ ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1 # drop main ipv4 table ip -net "$nsrouter" -4 rule delete table main -if ! test_ping 10.0.2.99 dead:2::99;then - ip -net "$nsrouter" nft list ruleset - echo "FAIL: fib mismatch in pbr setup" - exit 1 +if test_ping 10.0.2.99 dead:2::99;then + echo "PASS: fib expression forward check with policy based routing" +else + echo "FAIL: fib expression forward check with policy based routing" + ret=1 fi -echo "PASS: fib expression forward check with policy based routing" -exit 0 +test_fib_type "policy routing" +ip netns exec "$nsrouter" nft delete table ip filter +ip netns exec "$nsrouter" nft delete table ip6 filter + +# Un-do policy routing changes +ip -net "$nsrouter" rule del from all table 128 +ip -net "$nsrouter" rule del from all iif veth0 table 129 + +ip -net "$nsrouter" route del table 128 to 10.0.1.0/24 dev veth0 +ip -net "$nsrouter" route del table 129 to 10.0.2.0/24 dev veth1 + +ip -net "$ns1" -4 route del default +ip -net "$ns1" -6 route del default + +ip -net "$ns1" -4 route add default via 10.0.1.1 +ip -net "$ns1" -6 route add default via dead:1::1 + +ip -net "$nsrouter" -4 rule add from all table main priority 32766 + +test_fib_type "default table" +ip netns exec "$nsrouter" nft delete table ip filter +ip netns exec "$nsrouter" nft delete table ip6 filter + +test_fib_vrf + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_interface_stress.sh b/tools/testing/selftests/net/netfilter/nft_interface_stress.sh new file mode 100755 index 000000000000..11d82d11495e --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_interface_stress.sh @@ -0,0 +1,151 @@ +#!/bin/bash -e +# +# SPDX-License-Identifier: GPL-2.0 +# +# Torture nftables' netdevice notifier callbacks and related code by frequent +# renaming of interfaces which netdev-family chains and flowtables hook into. + +source lib.sh + +checktool "nft --version" "run test without nft tool" +checktool "iperf3 --version" "run test without iperf3 tool" + +# how many seconds to torture the kernel? +# default to 80% of max run time but don't exceed 48s +TEST_RUNTIME=$((${kselftest_timeout:-60} * 8 / 10)) +[[ $TEST_RUNTIME -gt 48 ]] && TEST_RUNTIME=48 + +trap "cleanup_all_ns" EXIT + +setup_ns nsc nsr nss + +ip -net $nsc link add cr0 type veth peer name rc0 netns $nsr +ip -net $nsc addr add 10.0.0.1/24 dev cr0 +ip -net $nsc link set cr0 up +ip -net $nsc route add default via 10.0.0.2 + +ip -net $nss link add sr0 type veth peer name rs0 netns $nsr +ip -net $nss addr add 10.1.0.1/24 dev sr0 +ip -net $nss link set sr0 up +ip -net $nss route add default via 10.1.0.2 + +ip -net $nsr addr add 10.0.0.2/24 dev rc0 +ip -net $nsr link set rc0 up +ip -net $nsr addr add 10.1.0.2/24 dev rs0 +ip -net $nsr link set rs0 up +ip netns exec $nsr sysctl -q net.ipv4.ip_forward=1 +ip netns exec $nsr sysctl -q net.ipv4.conf.all.forwarding=1 + +{ + echo "table netdev t {" + for ((i = 0; i < 10; i++)); do + cat <<-EOF + chain chain_rc$i { + type filter hook ingress device rc$i priority 0 + counter + } + chain chain_rs$i { + type filter hook ingress device rs$i priority 0 + counter + } + EOF + done + echo "}" + echo "table ip t {" + for ((i = 0; i < 10; i++)); do + cat <<-EOF + flowtable ft_${i} { + hook ingress priority 0 + devices = { rc$i, rs$i } + } + EOF + done + echo "chain c {" + echo "type filter hook forward priority 0" + for ((i = 0; i < 10; i++)); do + echo -n "iifname rc$i oifname rs$i " + echo "ip protocol tcp counter flow add @ft_${i}" + done + echo "counter" + echo "}" + echo "}" +} | ip netns exec $nsr nft -f - || { + echo "SKIP: Could not load nft ruleset" + exit $ksft_skip +} + +for ((o=0, n=1; ; o=n, n++, n %= 10)); do + ip -net $nsr link set rc$o name rc$n + ip -net $nsr link set rs$o name rs$n +done & +rename_loop_pid=$! + +while true; do ip netns exec $nsr nft list ruleset >/dev/null 2>&1; done & +nft_list_pid=$! + +ip netns exec $nsr nft monitor >/dev/null & +nft_monitor_pid=$! + +ip netns exec $nss iperf3 --server --daemon -1 +summary_expr='s,^\[SUM\] .* \([0-9\.]\+\) Kbits/sec .* receiver,\1,p' +rate=$(ip netns exec $nsc iperf3 \ + --format k -c 10.1.0.1 --time $TEST_RUNTIME \ + --length 56 --parallel 10 -i 0 | sed -n "$summary_expr") + +kill $nft_list_pid +kill $nft_monitor_pid +kill $rename_loop_pid +wait + +ip netns exec $nsr nft -f - <<EOF +table ip t { + flowtable ft_wild { + hook ingress priority 0 + devices = { wild* } + } +} +EOF +if [[ $? -ne 0 ]]; then + echo "SKIP wildcard tests: not supported by host's nft?" +else + for ((i = 0; i < 100; i++)); do + ip -net $nsr link add wild$i type dummy & + done + wait + for ((i = 80; i < 100; i++)); do + ip -net $nsr link del wild$i & + done + for ((i = 0; i < 80; i++)); do + ip -net $nsr link del wild$i & + done + wait + for ((i = 0; i < 100; i += 10)); do + ( + for ((j = 0; j < 10; j++)); do + ip -net $nsr link add wild$((i + j)) type dummy + done + for ((j = 0; j < 10; j++)); do + ip -net $nsr link del wild$((i + j)) + done + ) & + done + wait +fi + +[[ $(</proc/sys/kernel/tainted) -eq 0 ]] || { + echo "FAIL: Kernel is tainted!" + exit $ksft_fail +} + +[[ $rate -gt 0 ]] || { + echo "FAIL: Zero throughput in iperf3" + exit $ksft_fail +} + +[[ -f /sys/kernel/debug/kmemleak && \ + -n $(</sys/kernel/debug/kmemleak) ]] && { + echo "FAIL: non-empty kmemleak report" + exit $ksft_fail +} + +exit $ksft_pass |