From 2232642ec3fb4aad6ae4da1e109f55a0e7f2d204 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Fri, 20 Aug 2021 13:37:52 +0800 Subject: ipvs: add sysctl_run_estimation to support disable estimation estimation_timer will iterate the est_list to do estimation for each ipvs stats. When there are lots of services, the list can be very large. We found that estimation_timer() run for more then 200ms on a machine with 104 CPU and 50K services. yunhong-cgl jiang report the same phenomenon before: https://www.spinics.net/lists/lvs-devel/msg05426.html In some cases(for example a large K8S cluster with many ipvs services), ipvs estimation may not be needed. So adding a sysctl blob to allow users to disable this completely. Default is: 1 (enable) Cc: yunhong-cgl jiang Signed-off-by: Dust Li Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++++ net/netfilter/ipvs/ip_vs_est.c | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c25097092a06..cbea5a68afb5 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2017,6 +2017,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "run_estimation", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4090,6 +4096,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 05b8112ffb37..9a1a7af6a186 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -100,6 +100,9 @@ static void estimation_timer(struct timer_list *t) u64 rate; struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); + if (!sysctl_run_estimation(ipvs)) + goto skip; + spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); @@ -131,6 +134,8 @@ static void estimation_timer(struct timer_list *t) spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); + +skip: mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -- cgit v1.2.3-59-g8ed1b From 7b1394892de8d95748d05e3ee41e85edb4abbfa1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 25 Sep 2021 22:40:26 +0200 Subject: netfilter: nft_dynset: relax superfluous check on set updates Relax this condition to make add and update commands idempotent for sets with no timeout. The eval function already checks if the set element timeout is available and updates it if the update command is used. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_dynset.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6ba3256fa844..87f3af4645d9 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -198,17 +198,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return -EBUSY; priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); - switch (priv->op) { - case NFT_DYNSET_OP_ADD: - case NFT_DYNSET_OP_DELETE: - break; - case NFT_DYNSET_OP_UPDATE: - if (!(set->flags & NFT_SET_TIMEOUT)) - return -EOPNOTSUPP; - break; - default: + if (priv->op > NFT_DYNSET_OP_DELETE) return -EOPNOTSUPP; - } timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { -- cgit v1.2.3-59-g8ed1b From 42df6e1d221dddc0f2acf2be37e68d553ad65f96 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 8 Oct 2021 22:06:03 +0200 Subject: netfilter: Introduce egress hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support classifying packets with netfilter on egress to satisfy user requirements such as: * outbound security policies for containers (Laura) * filtering and mangling intra-node Direct Server Return (DSR) traffic on a load balancer (Laura) * filtering locally generated traffic coming in through AF_PACKET, such as local ARP traffic generated for clustering purposes or DHCP (Laura; the AF_PACKET plumbing is contained in a follow-up commit) * L2 filtering from ingress and egress for AVB (Audio Video Bridging) and gPTP with nftables (Pablo) * in the future: in-kernel NAT64/NAT46 (Pablo) The egress hook introduced herein complements the ingress hook added by commit e687ad60af09 ("netfilter: add netfilter ingress hook after handle_ing() under unique static key"). A patch for nftables to hook up egress rules from user space has been submitted separately, so users may immediately take advantage of the feature. Alternatively or in addition to netfilter, packets can be classified with traffic control (tc). On ingress, packets are classified first by tc, then by netfilter. On egress, the order is reversed for symmetry. Conceptually, tc and netfilter can be thought of as layers, with netfilter layered above tc. Traffic control is capable of redirecting packets to another interface (man 8 tc-mirred). E.g., an ingress packet may be redirected from the host namespace to a container via a veth connection: tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container) In this case, netfilter egress classifying is not performed when leaving the host namespace! That's because the packet is still on the tc layer. If tc redirects the packet to a physical interface in the host namespace such that it leaves the system, the packet is never subjected to netfilter egress classifying. That is only logical since it hasn't passed through netfilter ingress classifying either. Packets can alternatively be redirected at the netfilter layer using nft fwd. Such a packet *is* subjected to netfilter egress classifying since it has reached the netfilter layer. Internally, the skb->nf_skip_egress flag controls whether netfilter is invoked on egress by __dev_queue_xmit(). Because __dev_queue_xmit() may be called recursively by tunnel drivers such as vxlan, the flag is reverted to false after sch_handle_egress(). This ensures that netfilter is applied both on the overlay and underlying network. Interaction between tc and netfilter is possible by setting and querying skb->mark. If netfilter egress classifying is not enabled on any interface, it is patched out of the data path by way of a static_key and doesn't make a performance difference that is discernible from noise: Before: 1537 1538 1538 1537 1538 1537 Mb/sec After: 1536 1534 1539 1539 1539 1540 Mb/sec Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec After + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec Before + tc drop: 1620 1619 1619 1619 1620 1620 Mb/sec After + tc drop: 1616 1624 1625 1624 1622 1619 Mb/sec When netfilter egress classifying is enabled on at least one interface, a minimal performance penalty is incurred for every egress packet, even if the interface it's transmitted over doesn't have any netfilter egress rules configured. That is caused by checking dev->nf_hooks_egress against NULL. Measurements were performed on a Core i7-3615QM. Commands to reproduce: ip link add dev foo type dummy ip link set dev foo up modprobe pktgen echo "add_device foo" > /proc/net/pktgen/kpktgend_3 samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1 Accept all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,' Drop all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,' Apply this patch when measuring packet drops to avoid errors in dmesg: https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/ Signed-off-by: Lukas Wunner Cc: Laura García Liébana Cc: John Fastabend Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Thomas Graf Signed-off-by: Pablo Neira Ayuso --- drivers/net/ifb.c | 3 ++ include/linux/netdevice.h | 4 ++ include/linux/netfilter_netdev.h | 86 ++++++++++++++++++++++++++++++++++++++++ include/linux/skbuff.h | 4 ++ include/uapi/linux/netfilter.h | 1 + net/core/dev.c | 15 ++++++- net/netfilter/Kconfig | 11 +++++ net/netfilter/core.c | 34 ++++++++++++++-- net/netfilter/nfnetlink_hook.c | 16 ++++++-- net/netfilter/nft_chain_filter.c | 4 +- 10 files changed, 168 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index e9258a9f3702..2c319dd27f29 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,10 @@ static void ifb_ri_tasklet(struct tasklet_struct *t) } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { + /* Skip tc and netfilter to prevent redirection loop. */ skb->redirected = 0; skb->tc_skip_classify = 1; + nf_skip_egress(skb, true); u64_stats_update_begin(&txp->tsync); txp->tx_packets++; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79163208dfd..e9a48068f306 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1861,6 +1861,7 @@ enum netdev_ml_priv_type { * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for * egress processing + * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) @@ -2161,6 +2162,9 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 5812b0fb0278..b71b57a83bb4 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -50,11 +50,97 @@ static inline int nf_hook_ingress(struct sk_buff *skb) } #endif /* CONFIG_NETFILTER_INGRESS */ +#ifdef CONFIG_NETFILTER_EGRESS +static inline bool nf_hook_egress_active(void) +{ +#ifdef CONFIG_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) + return false; +#endif + return true; +} + +/** + * nf_hook_egress - classify packets before transmission + * @skb: packet to be classified + * @rc: result code which shall be returned by __dev_queue_xmit() on failure + * @dev: netdev whose egress hooks shall be applied to @skb + * + * Returns @skb on success or %NULL if the packet was consumed or filtered. + * Caller must hold rcu_read_lock. + * + * On ingress, packets are classified first by tc, then by netfilter. + * On egress, the order is reversed for symmetry. Conceptually, tc and + * netfilter can be thought of as layers, with netfilter layered above tc: + * When tc redirects a packet to another interface, netfilter is not applied + * because the packet is on the tc layer. + * + * The nf_skip_egress flag controls whether netfilter is applied on egress. + * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the + * packet passes through tc and netfilter. Because __dev_queue_xmit() may be + * called recursively by tunnel drivers such as vxlan, the flag is reverted to + * false after sch_handle_egress(). This ensures that netfilter is applied + * both on the overlay and underlying network. + */ +static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, + struct net_device *dev) +{ + struct nf_hook_entries *e; + struct nf_hook_state state; + int ret; + +#ifdef CONFIG_NETFILTER_SKIP_EGRESS + if (skb->nf_skip_egress) + return skb; +#endif + + e = rcu_dereference(dev->nf_hooks_egress); + if (!e) + return skb; + + nf_hook_state_init(&state, NF_NETDEV_EGRESS, + NFPROTO_NETDEV, dev, NULL, NULL, + dev_net(dev), NULL); + ret = nf_hook_slow(skb, &state, e, 0); + + if (ret == 1) { + return skb; + } else if (ret < 0) { + *rc = NET_XMIT_DROP; + return NULL; + } else { /* ret == 0 */ + *rc = NET_XMIT_SUCCESS; + return NULL; + } +} +#else /* CONFIG_NETFILTER_EGRESS */ +static inline bool nf_hook_egress_active(void) +{ + return false; +} + +static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, + struct net_device *dev) +{ + return skb; +} +#endif /* CONFIG_NETFILTER_EGRESS */ + +static inline void nf_skip_egress(struct sk_buff *skb, bool skip) +{ +#ifdef CONFIG_NETFILTER_SKIP_EGRESS + skb->nf_skip_egress = skip; +#endif +} + static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif +#ifdef CONFIG_NETFILTER_EGRESS + RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); +#endif } #endif /* _NETFILTER_NETDEV_H_ */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 841e2f0f5240..cb96f1e6460c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -652,6 +652,7 @@ typedef unsigned char *sk_buff_data_t; * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path + * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -868,6 +869,9 @@ struct sk_buff { #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif +#ifdef CONFIG_NETFILTER_SKIP_EGRESS + __u8 nf_skip_egress:1; +#endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ef9a44286e23..53411ccc69db 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -51,6 +51,7 @@ enum nf_inet_hooks { enum nf_dev_hooks { NF_NETDEV_INGRESS, + NF_NETDEV_EGRESS, NF_NETDEV_NUMHOOKS }; diff --git a/net/core/dev.c b/net/core/dev.c index e4c683029c61..09d74798b440 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3920,6 +3920,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { +#ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; @@ -3955,6 +3956,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) default: break; } +#endif /* CONFIG_NET_CLS_ACT */ return skb; } @@ -4148,13 +4150,20 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; -# ifdef CONFIG_NET_EGRESS +#endif +#ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { + if (nf_hook_egress_active()) { + skb = nf_hook_egress(skb, &rc, dev); + if (!skb) + goto out; + } + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; + nf_skip_egress(skb, false); } -# endif #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. @@ -5296,6 +5305,7 @@ skip_taps: if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; + nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) @@ -5303,6 +5313,7 @@ skip_taps: if (!skb) goto out; + nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 54395266339d..49c9fae9c62c 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -10,6 +10,17 @@ config NETFILTER_INGRESS This allows you to classify packets from ingress using the Netfilter infrastructure. +config NETFILTER_EGRESS + bool "Netfilter egress support" + default y + select NET_EGRESS + help + This allows you to classify packets before transmission using the + Netfilter infrastructure. + +config NETFILTER_SKIP_EGRESS + def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB) + config NETFILTER_NETLINK tristate diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 63d032191e62..3a32a813fcde 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -316,6 +316,12 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (dev && dev_net(dev) == net) return &dev->nf_hooks_ingress; } +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hooknum == NF_NETDEV_EGRESS) { + if (dev && dev_net(dev) == net) + return &dev->nf_hooks_egress; + } #endif WARN_ON_ONCE(1); return NULL; @@ -344,6 +350,11 @@ static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) return false; } +static inline bool nf_egress_hook(const struct nf_hook_ops *reg, int pf) +{ + return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; +} + static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL @@ -383,9 +394,18 @@ static int __nf_register_net_hook(struct net *net, int pf, switch (pf) { case NFPROTO_NETDEV: - err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS); - if (err < 0) - return err; +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) + return -EOPNOTSUPP; +#endif +#ifndef CONFIG_NETFILTER_EGRESS + if (reg->hooknum == NF_NETDEV_EGRESS) + return -EOPNOTSUPP; +#endif + if ((reg->hooknum != NF_NETDEV_INGRESS && + reg->hooknum != NF_NETDEV_EGRESS) || + !reg->dev || dev_net(reg->dev) != net) + return -EINVAL; break; case NFPROTO_INET: if (reg->hooknum != NF_INET_INGRESS) @@ -417,6 +437,10 @@ static int __nf_register_net_hook(struct net *net, int pf, #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_inc_egress_queue(); #endif nf_static_key_inc(reg, pf); @@ -474,6 +498,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf, #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_dec_egress_queue(); #endif nf_static_key_dec(reg, pf); } else { diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index f554e2ea32ee..d5c719c9e36c 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -185,7 +185,7 @@ static const struct nf_hook_entries * nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) { const struct nf_hook_entries *hook_head = NULL; -#ifdef CONFIG_NETFILTER_INGRESS +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) struct net_device *netdev; #endif @@ -221,9 +221,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); break; #endif -#ifdef CONFIG_NETFILTER_INGRESS +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) case NFPROTO_NETDEV: - if (hook != NF_NETDEV_INGRESS) + if (hook >= NF_NETDEV_NUMHOOKS) return ERR_PTR(-EOPNOTSUPP); if (!dev) @@ -233,7 +233,15 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de if (!netdev) return ERR_PTR(-ENODEV); - return rcu_dereference(netdev->nf_hooks_ingress); +#ifdef CONFIG_NETFILTER_INGRESS + if (hook == NF_NETDEV_INGRESS) + return rcu_dereference(netdev->nf_hooks_ingress); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hook == NF_NETDEV_EGRESS) + return rcu_dereference(netdev->nf_hooks_egress); +#endif + fallthrough; #endif default: return ERR_PTR(-EPROTONOSUPPORT); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 5b02408a920b..680fe557686e 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -310,9 +310,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, - .hook_mask = (1 << NF_NETDEV_INGRESS), + .hook_mask = (1 << NF_NETDEV_INGRESS) | + (1 << NF_NETDEV_EGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; -- cgit v1.2.3-59-g8ed1b From 9dd43a5f4b11b161c9dfcce9391e843e65d6a4cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Oct 2021 19:29:56 +0200 Subject: netfilter: ipvs: prepare for hook function reduction ipvs has multiple one-line wrappers for hooks, compact them. To avoid a large patch make the two most common helpers use the same function signature as hooks. Next patches can then remove the oneline wrappers. Signed-off-by: Florian Westphal Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 128690c512df..5a5deee3425c 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1330,12 +1330,15 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + int af = state->pf; struct sock *sk; EnterFunction(11); @@ -1477,7 +1480,7 @@ static unsigned int ip_vs_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); + return ip_vs_out_hook(priv, skb, state); } /* @@ -1488,7 +1491,7 @@ static unsigned int ip_vs_local_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); + return ip_vs_out_hook(priv, skb, state); } #ifdef CONFIG_IP_VS_IPV6 @@ -1502,7 +1505,7 @@ static unsigned int ip_vs_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); + return ip_vs_out_hook(priv, skb, state); } /* @@ -1513,7 +1516,7 @@ static unsigned int ip_vs_local_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); + return ip_vs_out_hook(priv, skb, state); } #endif @@ -1957,8 +1960,10 @@ out: * and send it on its way... */ static unsigned int -ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; @@ -1966,6 +1971,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int int ret, pkts; int conn_reuse_mode; struct sock *sk; + int af = state->pf; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -2145,7 +2151,7 @@ static unsigned int ip_vs_remote_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); + return ip_vs_in_hook(priv, skb, state); } /* @@ -2156,7 +2162,7 @@ static unsigned int ip_vs_local_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); + return ip_vs_in_hook(priv, skb, state); } #ifdef CONFIG_IP_VS_IPV6 @@ -2169,7 +2175,7 @@ static unsigned int ip_vs_remote_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); + return ip_vs_in_hook(priv, skb, state); } /* @@ -2180,7 +2186,7 @@ static unsigned int ip_vs_local_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); + return ip_vs_in_hook(priv, skb, state); } #endif -- cgit v1.2.3-59-g8ed1b From 8a9941b42de5132eae0cd2c27d5da41024f278a2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Oct 2021 19:29:57 +0200 Subject: netfilter: ipvs: remove unneeded output wrappers After earlier patch we can use ip_vs_out_hook directly. Signed-off-by: Florian Westphal Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 62 ++++------------------------------------- 1 file changed, 6 insertions(+), 56 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5a5deee3425c..2db033455204 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1471,56 +1471,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat return NF_ACCEPT; } -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out_hook(priv, skb, state); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out_hook(priv, skb, state); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out_hook(priv, skb, state); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out_hook(priv, skb, state); -} - -#endif - static unsigned int ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, @@ -2243,7 +2193,7 @@ ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, @@ -2259,7 +2209,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, @@ -2281,7 +2231,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -2292,7 +2242,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, @@ -2308,7 +2258,7 @@ static const struct nf_hook_ops ip_vs_ops6[] = { }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, @@ -2330,7 +2280,7 @@ static const struct nf_hook_ops ip_vs_ops6[] = { }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, -- cgit v1.2.3-59-g8ed1b From 540ff44b28f0b31d0c74b5e7082b050e1b14b36a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Oct 2021 19:29:58 +0200 Subject: netfilter: ipvs: remove unneeded input wrappers After earlier patch ip_vs_hook_in can be used directly. Signed-off-by: Florian Westphal Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 57 +++-------------------------------------- 1 file changed, 4 insertions(+), 53 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2db033455204..ad5f5e50547f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2093,55 +2093,6 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state return ret; } -/* - * AF_INET handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in_hook(priv, skb, state); -} - -/* - * AF_INET handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in_hook(priv, skb, state); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * AF_INET6 handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in_hook(priv, skb, state); -} - -/* - * AF_INET6 handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in_hook(priv, skb, state); -} - -#endif - - /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. @@ -2202,7 +2153,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_remote_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, @@ -2216,7 +2167,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, @@ -2251,7 +2202,7 @@ static const struct nf_hook_ops ip_vs_ops6[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_remote_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, @@ -2265,7 +2216,7 @@ static const struct nf_hook_ops ip_vs_ops6[] = { }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, -- cgit v1.2.3-59-g8ed1b From c650c35a2506d8eebe8a4d8d263317fba29fe078 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Oct 2021 19:29:59 +0200 Subject: netfilter: ipvs: merge ipv4 + ipv6 icmp reply handlers Similar to earlier patches: allow ipv4 and ipv6 to use the same handler. ipv4 and ipv6 specific actions can be done by checking state->pf. v2: split the pf == NFPROTO_IPV4 check (Julian Anastasov) Signed-off-by: Florian Westphal Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index ad5f5e50547f..e93c937a8bf0 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2106,40 +2106,31 @@ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int r; struct netns_ipvs *ipvs = net_ipvs(state->net); - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; + int r; /* ipvs enabled in this netns ? */ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(ipvs, skb, &r, state->hook); -} - + if (state->pf == NFPROTO_IPV4) { + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - int r; - struct netns_ipvs *ipvs = net_ipvs(state->net); - struct ip_vs_iphdr iphdr; + } else { + struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - if (iphdr.protocol != IPPROTO_ICMPV6) - return NF_ACCEPT; + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) - return NF_ACCEPT; + if (iphdr.protocol != IPPROTO_ICMPV6) + return NF_ACCEPT; - return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); -} + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif + } + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); +} static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ @@ -2224,7 +2215,7 @@ static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { - .hook = ip_vs_forward_icmp_v6, + .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, -- cgit v1.2.3-59-g8ed1b From ffdd33dd9c12a8c263f78d778066709ef94671f9 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 16 Oct 2021 10:13:27 +0200 Subject: netfilter: core: Fix clang warnings about unused static inlines Unlike gcc, clang warns about unused static inlines that are not in an include file: net/netfilter/core.c:344:20: error: unused function 'nf_ingress_hook' [-Werror,-Wunused-function] static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) ^ net/netfilter/core.c:353:20: error: unused function 'nf_egress_hook' [-Werror,-Wunused-function] static inline bool nf_egress_hook(const struct nf_hook_ops *reg, int pf) ^ According to commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"), the proper resolution is to mark the affected functions as __maybe_unused. An alternative approach would be to move them to include/linux/netfilter_netdev.h, but since Pablo didn't do that in commit ddcfa710d40b ("netfilter: add nf_ingress_hook() helper function"), I'm guessing __maybe_unused is preferred. This fixes both the warning introduced by Pablo in v5.10 as well as the one recently introduced by myself with commit 42df6e1d221d ("netfilter: Introduce egress hook"). Fixes: ddcfa710d40b ("netfilter: add nf_ingress_hook() helper function") Reported-by: kernel test robot Signed-off-by: Lukas Wunner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3a32a813fcde..6dec9cd395f1 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -341,7 +341,8 @@ static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, return 0; } -static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) +static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, + int pf) { if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) @@ -350,7 +351,8 @@ static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) return false; } -static inline bool nf_egress_hook(const struct nf_hook_ops *reg, int pf) +static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, + int pf) { return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; } -- cgit v1.2.3-59-g8ed1b