From 2c82c7e724ff51cab78e1afd5c2aaa31994fe41e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Apr 2019 14:53:11 +0200 Subject: netfilter: nf_tables: fix oops during rule dump We can oops in nf_tables_fill_rule_info(). Its not possible to fetch previous element in rcu-protected lists when deletions are not prevented somehow: list_del_rcu poisons the ->prev pointer value. Before rcu-conversion this was safe as dump operations did hold nfnetlink mutex. Pass previous rule as argument, obtained by keeping a pointer to the previous rule during traversal. Fixes: d9adf22a291883 ("netfilter: nf_tables: use call_rcu in netlink dumps") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 28241e82fd15..4b5159936034 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2270,13 +2270,13 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule) + const struct nft_rule *rule, + const struct nft_rule *prule) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; - const struct nft_rule *prule; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); @@ -2296,8 +2296,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { - prule = list_prev_entry(rule, list); + if (event != NFT_MSG_DELRULE && prule) { if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(prule->handle), NFTA_RULE_PAD)) @@ -2344,7 +2343,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, ctx->family, ctx->table, - ctx->chain, rule); + ctx->chain, rule, NULL); if (err < 0) { kfree_skb(skb); goto err; @@ -2369,12 +2368,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, const struct nft_chain *chain) { struct net *net = sock_net(skb->sk); + const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; - const struct nft_rule *rule; + prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) - goto cont; + goto cont_skip; if (*idx < s_idx) goto cont; if (*idx > s_idx) { @@ -2386,11 +2386,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule) < 0) + table, chain, rule, prule) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: + prule = rule; +cont_skip: (*idx)++; } return 0; @@ -2546,7 +2548,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule); + family, table, chain, rule, NULL); if (err < 0) goto err; -- cgit v1.2.3-59-g8ed1b From 946c0d8e6ed43dae6527e878d0077c1e11015db0 Mon Sep 17 00:00:00 2001 From: Jagdish Motwani Date: Mon, 13 May 2019 23:47:40 +0530 Subject: netfilter: nf_queue: fix reinject verdict handling This patch fixes netfilter hook traversal when there are more than 1 hooks returning NF_QUEUE verdict. When the first queue reinjects the packet, 'nf_reinject' starts traversing hooks with a proper hook_index. However, if it again receives a NF_QUEUE verdict (by some other netfilter hook), it queues the packet with a wrong hook_index. So, when the second queue reinjects the packet, it re-executes hooks in between. Fixes: 960632ece694 ("netfilter: convert hook list to an array") Signed-off-by: Jagdish Motwani Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 9dc1d6e04946..b5b2be55ca82 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -255,6 +255,7 @@ static unsigned int nf_iterate(struct sk_buff *skb, repeat: verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { + *index = i; if (verdict != NF_REPEAT) return verdict; goto repeat; -- cgit v1.2.3-59-g8ed1b From e633508a95289489d28faacb68b32c3e7e68ef6f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 15 May 2019 20:15:32 +0200 Subject: netfilter: nft_fib: Fix existence check support NFTA_FIB_F_PRESENT flag was not always honored since eval functions did not call nft_fib_store_result in all cases. Given that in all callsites there is a struct net_device pointer available which holds the interface data to be stored in destination register, simplify nft_fib_store_result() to just accept that pointer instead of the nft_pktinfo pointer and interface index. This also allows to drop the index to interface lookup previously needed to get the name associated with given index. Fixes: 055c4b34b94f6 ("netfilter: nft_fib: Support existence check") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 2 +- net/ipv4/netfilter/nft_fib_ipv4.c | 23 +++-------------------- net/ipv6/netfilter/nft_fib_ipv6.c | 16 ++-------------- net/netfilter/nft_fib.c | 6 +++--- 4 files changed, 9 insertions(+), 38 deletions(-) diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index a88f92737308..e4c4d8eaca8c 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -34,5 +34,5 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_fib_store_result(void *reg, const struct nft_fib *priv, - const struct nft_pktinfo *pkt, int index); + const struct net_device *dev); #endif diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 94eb25bc8d7e..c8888e52591f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -58,11 +58,6 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); -static int get_ifindex(const struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} - void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -94,8 +89,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, pkt, - nft_in(pkt)->ifindex); + nft_fib_store_result(dest, priv, nft_in(pkt)); return; } @@ -108,8 +102,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv, pkt, - get_ifindex(pkt->skb->dev)); + nft_fib_store_result(dest, priv, pkt->skb->dev); return; } } @@ -150,17 +143,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, found = oif; } - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - *dest = found->ifindex; - break; - case NFT_FIB_RESULT_OIFNAME: - strncpy((char *)dest, found->name, IFNAMSIZ); - break; - default: - WARN_ON_ONCE(1); - break; - } + nft_fib_store_result(dest, priv, found); } EXPORT_SYMBOL_GPL(nft_fib4_eval); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 73cdc0bc63f7..ec068b0cffca 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -169,8 +169,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, pkt, - nft_in(pkt)->ifindex); + nft_fib_store_result(dest, priv, nft_in(pkt)); return; } @@ -187,18 +186,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (oif && oif != rt->rt6i_idev->dev) goto put_rt_err; - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - *dest = rt->rt6i_idev->dev->ifindex; - break; - case NFT_FIB_RESULT_OIFNAME: - strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ); - break; - default: - WARN_ON_ONCE(1); - break; - } - + nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); put_rt_err: ip6_rt_put(rt); } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 21df8cccea65..77f00a99dfab 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -135,17 +135,17 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) EXPORT_SYMBOL_GPL(nft_fib_dump); void nft_fib_store_result(void *reg, const struct nft_fib *priv, - const struct nft_pktinfo *pkt, int index) + const struct net_device *dev) { - struct net_device *dev; u32 *dreg = reg; + int index; switch (priv->result) { case NFT_FIB_RESULT_OIF: + index = dev ? dev->ifindex : 0; *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index; break; case NFT_FIB_RESULT_OIFNAME: - dev = dev_get_by_index_rcu(nft_net(pkt), index); if (priv->flags & NFTA_FIB_F_PRESENT) *dreg = !!dev; else -- cgit v1.2.3-59-g8ed1b From 719c7d563c17b150877cee03a4b812a424989dfa Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 17 May 2019 22:31:49 +0800 Subject: ipvs: Fix use-after-free in ip_vs_in BUG: KASAN: use-after-free in ip_vs_in.part.29+0xe8/0xd20 [ip_vs] Read of size 4 at addr ffff8881e9b26e2c by task sshd/5603 CPU: 0 PID: 5603 Comm: sshd Not tainted 4.19.39+ #30 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x270 kasan_report+0x179/0x2c0 ip_vs_in.part.29+0xe8/0xd20 [ip_vs] ip_vs_in+0xd8/0x170 [ip_vs] nf_hook_slow+0x5f/0xe0 __ip_local_out+0x1d5/0x250 ip_local_out+0x19/0x60 __tcp_transmit_skb+0xba1/0x14f0 tcp_write_xmit+0x41f/0x1ed0 ? _copy_from_iter_full+0xca/0x340 __tcp_push_pending_frames+0x52/0x140 tcp_sendmsg_locked+0x787/0x1600 ? tcp_sendpage+0x60/0x60 ? inet_sk_set_state+0xb0/0xb0 tcp_sendmsg+0x27/0x40 sock_sendmsg+0x6d/0x80 sock_write_iter+0x121/0x1c0 ? sock_sendmsg+0x80/0x80 __vfs_write+0x23e/0x370 vfs_write+0xe7/0x230 ksys_write+0xa1/0x120 ? __ia32_sys_read+0x50/0x50 ? __audit_syscall_exit+0x3ce/0x450 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7ff6f6147c60 Code: 73 01 c3 48 8b 0d 28 12 2d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 5d 73 2d 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 RSP: 002b:00007ffd772ead18 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000034 RCX: 00007ff6f6147c60 RDX: 0000000000000034 RSI: 000055df30a31270 RDI: 0000000000000003 RBP: 000055df30a31270 R08: 0000000000000000 R09: 0000000000000000 R10: 00007ffd772ead70 R11: 0000000000000246 R12: 00007ffd772ead74 R13: 00007ffd772eae20 R14: 00007ffd772eae24 R15: 000055df2f12ddc0 Allocated by task 6052: kasan_kmalloc+0xa0/0xd0 __kmalloc+0x10a/0x220 ops_init+0x97/0x190 register_pernet_operations+0x1ac/0x360 register_pernet_subsys+0x24/0x40 0xffffffffc0ea016d do_one_initcall+0x8b/0x253 do_init_module+0xe3/0x335 load_module+0x2fc0/0x3890 __do_sys_finit_module+0x192/0x1c0 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 6067: __kasan_slab_free+0x130/0x180 kfree+0x90/0x1a0 ops_free_list.part.7+0xa6/0xc0 unregister_pernet_operations+0x18b/0x1f0 unregister_pernet_subsys+0x1d/0x30 ip_vs_cleanup+0x1d/0xd2f [ip_vs] __x64_sys_delete_module+0x20c/0x300 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff8881e9b26600 which belongs to the cache kmalloc-4096 of size 4096 The buggy address is located 2092 bytes inside of 4096-byte region [ffff8881e9b26600, ffff8881e9b27600) The buggy address belongs to the page: page:ffffea0007a6c800 count:1 mapcount:0 mapping:ffff888107c0e600 index:0x0 compound_mapcount: 0 flags: 0x17ffffc0008100(slab|head) raw: 0017ffffc0008100 dead000000000100 dead000000000200 ffff888107c0e600 raw: 0000000000000000 0000000080070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected while unregistering ipvs module, ops_free_list calls __ip_vs_cleanup, then nf_unregister_net_hooks be called to do remove nf hook entries. It need a RCU period to finish, however net->ipvs is set to NULL immediately, which will trigger NULL pointer dereference when a packet is hooked and handled by ip_vs_in where net->ipvs is dereferenced. Another scene is ops_free_list call ops_free to free the net_generic directly while __ip_vs_cleanup finished, then calling ip_vs_in will triggers use-after-free. This patch moves nf_unregister_net_hooks from __ip_vs_cleanup() to __ip_vs_dev_cleanup(), where rcu_barrier() is called by unregister_pernet_device -> unregister_pernet_operations, that will do the needed grace period. Reported-by: Hulk Robot Fixes: efe41606184e ("ipvs: convert to use pernet nf_hook api") Suggested-by: Julian Anastasov Signed-off-by: YueHaibing Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 14457551bcb4..8ebf21149ec3 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2312,7 +2312,6 @@ static void __net_exit __ip_vs_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); @@ -2327,6 +2326,7 @@ static void __net_exit __ip_vs_dev_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); EnterFunction(2); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); -- cgit v1.2.3-59-g8ed1b From 82ce6eb1dd13fd12e449b2ee2c2ec051e6f52c43 Mon Sep 17 00:00:00 2001 From: Jeffrin Jose T Date: Wed, 15 May 2019 12:14:04 +0530 Subject: selftests: netfilter: missing error check when setting up veth interface A test for the basic NAT functionality uses ip command which needs veth device. There is a condition where the kernel support for veth is not compiled into the kernel and the test script breaks. This patch contains code for reasonable error display and correct code exit. Signed-off-by: Jeffrin Jose T Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_nat.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index 21159f5f3362..f102a65aa8b7 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -24,7 +24,11 @@ ip netns add ns0 ip netns add ns1 ip netns add ns2 -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 +ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 ip -net ns0 link set lo up -- cgit v1.2.3-59-g8ed1b From 6bac76db1da3cb162c425d58ae421486f8e43955 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 May 2019 13:48:10 +0200 Subject: netfilter: nat: fix udp checksum corruption Due to copy&paste error nf_nat_mangle_udp_packet passes IPPROTO_TCP, resulting in incorrect udp checksum when payload had to be mangled. Fixes: dac3fe72596f9 ("netfilter: nat: remove csum_recalc hook") Reported-by: Marc Haber Tested-by: Marc Haber Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index ccc06f7539d7..53aeb12b70fb 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -170,7 +170,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) return true; - nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_TCP, + nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_UDP, udph, &udph->check, datalen, oldlen); return true; -- cgit v1.2.3-59-g8ed1b From e75b3e1c9bc5b997d09bdf8eb72ab3dd3c1a7072 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:30 +0200 Subject: netfilter: nf_flow_table: ignore DF bit setting Its irrelevant if the DF bit is set or not, we must pass packet to stack in either case. If the DF bit is set, we must pass it to stack so the appropriate ICMP error can be generated. If the DF is not set, we must pass it to stack for fragmentation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 0d603e20b519..bfd44db9f214 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -243,8 +243,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; outdev = rt->dst.dev; - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && - (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*iph))) -- cgit v1.2.3-59-g8ed1b From 8437a6209f76f85a2db1abb12a9bde2170801617 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:31 +0200 Subject: netfilter: nft_flow_offload: set liberal tracking mode for tcp Without it, whenever a packet has to be pushed up the stack (e.g. because of mtu mismatch), then conntrack will flag packets as invalid, which in turn breaks NAT. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 69d7a8439c7a..bde63d9c3c4e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -72,6 +72,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; + bool is_tcp = false; struct nf_conn *ct; int ret; @@ -84,6 +85,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: + is_tcp = true; + break; case IPPROTO_UDP: break; default: @@ -108,6 +111,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; + if (is_tcp) { + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + } + ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; -- cgit v1.2.3-59-g8ed1b From 91a9048f238063dde7feea752b9dd386f7e3808b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:32 +0200 Subject: netfilter: nft_flow_offload: don't offload when sequence numbers need adjustment We can't deal with tcp sequence number rewrite in flow_offload. While at it, simplify helper check, we only need to know if the extension is present, we don't need the helper data. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index bde63d9c3c4e..c97c03c3939a 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -12,7 +12,6 @@ #include #include #include -#include struct nft_flow_offload { struct nft_flowtable *flowtable; @@ -67,7 +66,6 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; - const struct nf_conn_help *help; enum ip_conntrack_info ctinfo; struct nf_flow_route route; struct flow_offload *flow; @@ -93,8 +91,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; } - help = nfct_help(ct); - if (help) + if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || + ct->status & IPS_SEQ_ADJUST) goto out; if (!nf_ct_is_confirmed(ct)) -- cgit v1.2.3-59-g8ed1b From 69aeb538587e087bfc81dd1f465eab3558ff3158 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:33 +0200 Subject: netfilter: nft_flow_offload: IPCB is only valid for ipv4 family Guard this with a check vs. ipv4, IPCB isn't valid in ipv6 case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index c97c03c3939a..d70742e95e14 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -48,15 +48,20 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, return 0; } -static bool nft_flow_offload_skip(struct sk_buff *skb) +static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { - struct ip_options *opt = &(IPCB(skb)->opt); - - if (unlikely(opt->optlen)) - return true; if (skb_sec_path(skb)) return true; + if (family == NFPROTO_IPV4) { + const struct ip_options *opt; + + opt = &(IPCB(skb)->opt); + + if (unlikely(opt->optlen)) + return true; + } + return false; } @@ -74,7 +79,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nf_conn *ct; int ret; - if (nft_flow_offload_skip(pkt->skb)) + if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); -- cgit v1.2.3-59-g8ed1b From 2de03b45236f3af1800755614fd434d347adf046 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:34 +0200 Subject: selftests: netfilter: add flowtable test script Exercises 3 cases: 1. no pmtu discovery (need to frag) 2. no PMTUd + NAT (don't flag packets as invalid from conntrack) 3. PMTU + NAT (need to send icmp error) The first two cases make sure we handle fragments correctly, i.e. pass them to classic forwarding path. Third case checks we offload everything (in the test case, PMTUd will kick in so all packets should be within link mtu). Nftables rules will filter packets that are supposed to be handled by the fast-path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- tools/testing/selftests/netfilter/nft_flowtable.sh | 324 +++++++++++++++++++++ 2 files changed, 325 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nft_flowtable.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3e6d1bcc2894..4144984ebee5 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -2,6 +2,6 @@ # Makefile for netfilter selftests TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh + conntrack_icmp_related.sh nft_flowtable.sh include ../lib.mk diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh new file mode 100755 index 000000000000..fe52488a6f72 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -0,0 +1,324 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This tests basic flowtable functionality. +# Creates following topology: +# +# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) +# Router1 is the one doing flow offloading, Router2 has no special +# purpose other than having a link that is smaller than either Originator +# and responder, i.e. TCPMSS announced values are too large and will still +# result in fragmentation and/or PMTU discovery. + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +ns1in="" +ns2in="" +ns1out="" +ns2out="" + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +which nc > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nc (netcat)" + exit $ksft_skip +fi + +ip netns add nsr1 +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +ip netns add ns1 +ip netns add ns2 + +ip netns add nsr2 + +cleanup() { + for i in 1 2; do + ip netns del ns$i + ip netns del nsr$i + done + + rm -f "$ns1in" "$ns1out" + rm -f "$ns2in" "$ns2out" + + [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +trap cleanup EXIT + +sysctl -q net.netfilter.nf_log_all_netns=1 + +ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1 +ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2 + +ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2 + +for dev in lo veth0 veth1; do + for i in 1 2; do + ip -net nsr$i link set $dev up + done +done + +ip -net nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net nsr1 addr add dead:1::1/64 dev veth0 + +ip -net nsr2 addr add 10.0.2.1/24 dev veth1 +ip -net nsr2 addr add dead:2::1/64 dev veth1 + +# set different MTUs so we need to push packets coming from ns1 (large MTU) +# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), +# or to do PTMU discovery (send ICMP error back to originator). +# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers +# is NOT the lowest link mtu. + +ip -net nsr1 link set veth0 mtu 9000 +ip -net ns1 link set eth0 mtu 9000 + +ip -net nsr2 link set veth1 mtu 2000 +ip -net ns2 link set eth0 mtu 2000 + +# transfer-net between nsr1 and nsr2. +# these addresses are not used for connections. +ip -net nsr1 addr add 192.168.10.1/24 dev veth1 +ip -net nsr1 addr add fee1:2::1/64 dev veth1 + +ip -net nsr2 addr add 192.168.10.2/24 dev veth0 +ip -net nsr2 addr add fee1:2::2/64 dev veth0 + +for i in 1 2; do + ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip -net ns$i link set lo up + ip -net ns$i link set eth0 up + ip -net ns$i addr add 10.0.$i.99/24 dev eth0 + ip -net ns$i route add default via 10.0.$i.1 + ip -net ns$i addr add dead:$i::99/64 dev eth0 + ip -net ns$i route add default via dead:$i::1 + ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null + + # don't set ip DF bit for first two tests + ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null +done + +ip -net nsr1 route add default via 192.168.10.2 +ip -net nsr2 route add default via 192.168.10.1 + +ip netns exec nsr1 nft -f - < /dev/null +if [ $? -ne 0 ];then + echo "ERROR: ns1 cannot reach ns2" 1>&2 + bash + exit 1 +fi + +ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null +if [ $? -ne 0 ];then + echo "ERROR: ns2 cannot reach ns1" 1>&2 + exit 1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: netns routing/connectivity: ns1 can reach ns2" +fi + +ns1in=$(mktemp) +ns1out=$(mktemp) +ns2in=$(mktemp) +ns2out=$(mktemp) + +make_file() +{ + name=$1 + who=$2 + + SIZE=$((RANDOM % (1024 * 8))) + TSIZE=$((SIZE * 1024)) + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + + SIZE=$((RANDOM % 1024)) + SIZE=$((SIZE + 128)) + TSIZE=$((TSIZE + SIZE)) + dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "FAIL: file mismatch for $what" 1>&2 + ls -l "$in" + ls -l "$out" + return 1 + fi + + return 0 +} + +test_tcp_forwarding() +{ + local nsa=$1 + local nsb=$2 + local lret=0 + + ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" & + lpid=$! + + sleep 1 + ip netns exec $nsa nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" & + cpid=$! + + sleep 3 + + kill $lpid + kill $cpid + wait + + check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" + if [ $? -ne 0 ];then + lret=1 + fi + + check_transfer "$ns2in" "$ns1out" "ns1 <- ns2" + if [ $? -ne 0 ];then + lret=1 + fi + + return $lret +} + +make_file "$ns1in" "ns1" +make_file "$ns2in" "ns2" + +# First test: +# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. +test_tcp_forwarding ns1 ns2 +if [ $? -eq 0 ] ;then + echo "PASS: flow offloaded for ns1/ns2" +else + echo "FAIL: flow offload for ns1/ns2:" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# delete default route, i.e. ns2 won't be able to reach ns1 and +# will depend on ns1 being masqueraded in nsr1. +# expect ns1 has nsr1 address. +ip -net ns2 route del default via 10.0.2.1 +ip -net ns2 route del default via dead:2::1 +ip -net ns2 route add 192.168.10.1 via 10.0.2.1 + +# Second test: +# Same, but with NAT enabled. +ip netns exec nsr1 nft -f - <&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# Third test: +# Same as second test, but with PMTU discovery enabled. +handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) + +ip netns exec nsr1 nft delete rule inet filter forward $handle +if [ $? -ne 0 ] ;then + echo "FAIL: Could not delete large-packet accept rule" + exit 1 +fi + +ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null + +test_tcp_forwarding ns1 ns2 +if [ $? -eq 0 ] ;then + echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" +else + echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 + ip netns exec nsr1 nft list ruleset +fi + +exit $ret -- cgit v1.2.3-59-g8ed1b