aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/Kconfig153
-rw-r--r--net/netfilter/Makefile39
-rw-r--r--net/netfilter/core.c194
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c4
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c4
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c4
-rw-r--r--net/netfilter/ipset/ip_set_core.c258
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h117
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c16
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c17
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c18
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c21
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c23
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c18
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c23
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c2
-rw-r--r--net/netfilter/ipvs/Kconfig70
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c148
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c397
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c143
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c23
-rw-r--r--net/netfilter/ipvs/ip_vs_twos.c139
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c6
-rw-r--r--net/netfilter/nf_conncount.c11
-rw-r--r--net/netfilter/nf_conntrack_acct.c21
-rw-r--r--net/netfilter/nf_conntrack_bpf.c513
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c6
-rw-r--r--net/netfilter/nf_conntrack_core.c1153
-rw-r--r--net/netfilter/nf_conntrack_ecache.c395
-rw-r--r--net/netfilter/nf_conntrack_expect.c49
-rw-r--r--net/netfilter/nf_conntrack_extend.c148
-rw-r--r--net/netfilter/nf_conntrack_ftp.c19
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c6
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c275
-rw-r--r--net/netfilter/nf_conntrack_helper.c96
-rw-r--r--net/netfilter/nf_conntrack_irc.c56
-rw-r--r--net/netfilter/nf_conntrack_labels.c20
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c5
-rw-r--r--net/netfilter/nf_conntrack_netlink.c1189
-rw-r--r--net/netfilter/nf_conntrack_pptp.c128
-rw-r--r--net/netfilter/nf_conntrack_proto.c69
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c28
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c16
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c47
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c45
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c63
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c585
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c43
-rw-r--r--net/netfilter/nf_conntrack_sane.c65
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c16
-rw-r--r--net/netfilter/nf_conntrack_sip.c13
-rw-r--r--net/netfilter/nf_conntrack_standalone.c179
-rw-r--r--net/netfilter/nf_conntrack_timeout.c71
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c20
-rw-r--r--net/netfilter/nf_dup_netdev.c27
-rw-r--r--net/netfilter/nf_flow_table_core.c361
-rw-r--r--net/netfilter/nf_flow_table_inet.c44
-rw-r--r--net/netfilter/nf_flow_table_ip.c517
-rw-r--r--net/netfilter/nf_flow_table_offload.c594
-rw-r--r--net/netfilter/nf_flow_table_procfs.c80
-rw-r--r--net/netfilter/nf_hooks_lwtunnel.c53
-rw-r--r--net/netfilter/nf_internals.h17
-rw-r--r--net/netfilter/nf_log.c16
-rw-r--r--net/netfilter/nf_log_common.c212
-rw-r--r--net/netfilter/nf_log_netdev.c78
-rw-r--r--net/netfilter/nf_log_syslog.c1082
-rw-r--r--net/netfilter/nf_nat_amanda.c14
-rw-r--r--net/netfilter/nf_nat_bpf.c79
-rw-r--r--net/netfilter/nf_nat_core.c142
-rw-r--r--net/netfilter/nf_nat_ftp.c17
-rw-r--r--net/netfilter/nf_nat_helper.c31
-rw-r--r--net/netfilter/nf_nat_irc.c16
-rw-r--r--net/netfilter/nf_nat_masquerade.c173
-rw-r--r--net/netfilter/nf_nat_proto.c75
-rw-r--r--net/netfilter/nf_nat_sip.c14
-rw-r--r--net/netfilter/nf_queue.c151
-rw-r--r--net/netfilter/nf_sockopt.c60
-rw-r--r--net/netfilter/nf_synproxy_core.c43
-rw-r--r--net/netfilter/nf_tables_api.c4495
-rw-r--r--net/netfilter/nf_tables_core.c156
-rw-r--r--net/netfilter/nf_tables_offload.c244
-rw-r--r--net/netfilter/nf_tables_set_core.c31
-rw-r--r--net/netfilter/nf_tables_trace.c63
-rw-r--r--net/netfilter/nfnetlink.c258
-rw-r--r--net/netfilter/nfnetlink_acct.c137
-rw-r--r--net/netfilter/nfnetlink_cthelper.c101
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c302
-rw-r--r--net/netfilter/nfnetlink_hook.c393
-rw-r--r--net/netfilter/nfnetlink_log.c83
-rw-r--r--net/netfilter/nfnetlink_osf.c39
-rw-r--r--net/netfilter/nfnetlink_queue.c195
-rw-r--r--net/netfilter/nft_bitwise.c308
-rw-r--r--net/netfilter/nft_byteorder.c28
-rw-r--r--net/netfilter/nft_chain_filter.c78
-rw-r--r--net/netfilter/nft_chain_nat.c4
-rw-r--r--net/netfilter/nft_chain_route.c8
-rw-r--r--net/netfilter/nft_cmp.c225
-rw-r--r--net/netfilter/nft_compat.c144
-rw-r--r--net/netfilter/nft_connlimit.c39
-rw-r--r--net/netfilter/nft_counter.c89
-rw-r--r--net/netfilter/nft_ct.c96
-rw-r--r--net/netfilter/nft_dup_netdev.c14
-rw-r--r--net/netfilter/nft_dynset.c246
-rw-r--r--net/netfilter/nft_exthdr.c241
-rw-r--r--net/netfilter/nft_fib.c53
-rw-r--r--net/netfilter/nft_fib_inet.c2
-rw-r--r--net/netfilter/nft_fib_netdev.c2
-rw-r--r--net/netfilter/nft_flow_offload.c248
-rw-r--r--net/netfilter/nft_fwd_netdev.c42
-rw-r--r--net/netfilter/nft_hash.c62
-rw-r--r--net/netfilter/nft_immediate.c103
-rw-r--r--net/netfilter/nft_last.c133
-rw-r--r--net/netfilter/nft_limit.c195
-rw-r--r--net/netfilter/nft_log.c24
-rw-r--r--net/netfilter/nft_lookup.c80
-rw-r--r--net/netfilter/nft_masq.c22
-rw-r--r--net/netfilter/nft_meta.c98
-rw-r--r--net/netfilter/nft_nat.c155
-rw-r--r--net/netfilter/nft_numgen.c80
-rw-r--r--net/netfilter/nft_objref.c22
-rw-r--r--net/netfilter/nft_osf.c61
-rw-r--r--net/netfilter/nft_payload.c286
-rw-r--r--net/netfilter/nft_queue.c42
-rw-r--r--net/netfilter/nft_quota.c56
-rw-r--r--net/netfilter/nft_range.c34
-rw-r--r--net/netfilter/nft_redir.c22
-rw-r--r--net/netfilter/nft_reject.c16
-rw-r--r--net/netfilter/nft_reject_inet.c76
-rw-r--r--net/netfilter/nft_reject_netdev.c191
-rw-r--r--net/netfilter/nft_rt.c8
-rw-r--r--net/netfilter/nft_set_bitmap.c14
-rw-r--r--net/netfilter/nft_set_hash.c81
-rw-r--r--net/netfilter/nft_set_pipapo.c757
-rw-r--r--net/netfilter/nft_set_pipapo.h280
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c1228
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h12
-rw-r--r--net/netfilter/nft_set_rbtree.c167
-rw-r--r--net/netfilter/nft_socket.c167
-rw-r--r--net/netfilter/nft_synproxy.c10
-rw-r--r--net/netfilter/nft_tproxy.c66
-rw-r--r--net/netfilter/nft_tunnel.c149
-rw-r--r--net/netfilter/nft_xfrm.c43
-rw-r--r--net/netfilter/utils.c12
-rw-r--r--net/netfilter/x_tables.c261
-rw-r--r--net/netfilter/xt_AUDIT.c2
-rw-r--r--net/netfilter/xt_CONNSECMARK.c2
-rw-r--r--net/netfilter/xt_CT.c38
-rw-r--r--net/netfilter/xt_DSCP.c8
-rw-r--r--net/netfilter/xt_HMARK.c2
-rw-r--r--net/netfilter/xt_IDLETIMER.c253
-rw-r--r--net/netfilter/xt_LOG.c11
-rw-r--r--net/netfilter/xt_NFLOG.c11
-rw-r--r--net/netfilter/xt_RATEEST.c12
-rw-r--r--net/netfilter/xt_SECMARK.c90
-rw-r--r--net/netfilter/xt_TCPMSS.c4
-rw-r--r--net/netfilter/xt_TPROXY.c38
-rw-r--r--net/netfilter/xt_TRACE.c1
-rw-r--r--net/netfilter/xt_bpf.c2
-rw-r--r--net/netfilter/xt_connlimit.c6
-rw-r--r--net/netfilter/xt_connmark.c2
-rw-r--r--net/netfilter/xt_hashlimit.c20
-rw-r--r--net/netfilter/xt_limit.c52
-rw-r--r--net/netfilter/xt_nat.c1
-rw-r--r--net/netfilter/xt_nfacct.c4
-rw-r--r--net/netfilter/xt_recent.c22
-rw-r--r--net/netfilter/xt_socket.c16
-rw-r--r--net/netfilter/xt_statistic.c2
-rw-r--r--net/netfilter/xt_time.c2
180 files changed, 17873 insertions, 7267 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 91efae88e8c2..4b8d04640ff3 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
menu "Core Netfilter Configuration"
- depends on NET && INET && NETFILTER
+ depends on INET && NETFILTER
config NETFILTER_INGRESS
bool "Netfilter ingress support"
@@ -10,6 +10,17 @@ config NETFILTER_INGRESS
This allows you to classify packets from ingress using the Netfilter
infrastructure.
+config NETFILTER_EGRESS
+ bool "Netfilter egress support"
+ default y
+ select NET_EGRESS
+ help
+ This allows you to classify packets before transmission using the
+ Netfilter infrastructure.
+
+config NETFILTER_SKIP_EGRESS
+ def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB)
+
config NETFILTER_NETLINK
tristate
@@ -19,6 +30,16 @@ config NETFILTER_FAMILY_BRIDGE
config NETFILTER_FAMILY_ARP
bool
+config NETFILTER_NETLINK_HOOK
+ tristate "Netfilter base hook dump support"
+ depends on NETFILTER_ADVANCED
+ depends on NF_TABLES
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ to list the base netfilter hooks via NFNETLINK.
+ This is helpful for debugging.
+
config NETFILTER_NETLINK_ACCT
tristate "Netfilter NFACCT over NFNETLINK interface"
depends on NETFILTER_ADVANCED
@@ -71,12 +92,17 @@ config NF_CONNTRACK
To compile it as a module, choose M here. If unsure, say N.
-config NF_LOG_COMMON
- tristate
-
-config NF_LOG_NETDEV
- tristate "Netdev packet logging"
- select NF_LOG_COMMON
+config NF_LOG_SYSLOG
+ tristate "Syslog packet logging"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option enables support for packet logging via syslog.
+ It supports IPv4, IPv6, ARP and common transport protocols such
+ as TCP and UDP.
+ This is a simpler but less flexible logging method compared to
+ CONFIG_NETFILTER_NETLINK_LOG.
+ If both are enabled the backend to use can be configured at run-time
+ by means of per-address-family sysctl tunables.
if NF_CONNTRACK
config NETFILTER_CONNCOUNT
@@ -94,7 +120,7 @@ config NF_CONNTRACK_MARK
config NF_CONNTRACK_SECMARK
bool 'Connection tracking security mark support'
depends on NETWORK_SECMARK
- default m if NETFILTER_ADVANCED=n
+ default y if NETFILTER_ADVANCED=n
help
This option enables security markings to be applied to
connections. Typically they are copied to connections from
@@ -118,9 +144,8 @@ config NF_CONNTRACK_ZONES
config NF_CONNTRACK_PROCFS
bool "Supply CT list in procfs (OBSOLETE)"
- default y
depends on PROC_FS
- ---help---
+ help
This option enables for the list of known conntrack entries
to be shown in procfs under net/netfilter/nf_conntrack. This
is considered obsolete in favor of using the conntrack(8)
@@ -441,13 +466,14 @@ endif # NF_CONNTRACK
config NF_TABLES
select NETFILTER_NETLINK
+ select LIBCRC32C
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
provides a pseudo-state machine with an extensible instruction-set
(also known as expressions) that the userspace 'nft' utility
- (http://www.netfilter.org/projects/nftables) uses to build the
+ (https://www.netfilter.org/projects/nftables) uses to build the
rule-set. It also comes with the generic set infrastructure that
allows you to construct mappings between matchings and actions
for performance lookups.
@@ -455,14 +481,6 @@ config NF_TABLES
To compile it as a module, choose M here.
if NF_TABLES
-
-config NF_TABLES_SET
- tristate "Netfilter nf_tables set infrastructure"
- help
- This option enables the nf_tables set infrastructure that allows to
- look up for elements in a set and to build one-way mappings between
- matchings and actions.
-
config NF_TABLES_INET
depends on IPV6
select NF_TABLES_IPV4
@@ -496,12 +514,6 @@ config NFT_FLOW_OFFLOAD
This option adds the "flow_offload" expression that you can use to
choose what flows are placed into the hardware.
-config NFT_COUNTER
- tristate "Netfilter nf_tables counter module"
- help
- This option adds the "counter" expression that you can use to
- include packet and byte counters in a rule.
-
config NFT_CONNLIMIT
tristate "Netfilter nf_tables connlimit module"
depends on NF_CONNTRACK
@@ -689,6 +701,16 @@ config NFT_FIB_NETDEV
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_REJECT_NETDEV
+ depends on NFT_REJECT_IPV4
+ depends on NFT_REJECT_IPV6
+ tristate "Netfilter nf_tables netdev REJECT support"
+ help
+ This option enables the REJECT support from the netdev table.
+ The return packet generation will be delegated to the IPv4
+ or IPv6 ICMP or TCP RST implementation depending on the
+ protocol of the packet.
+
endif # NF_TABLES_NETDEV
endif # NF_TABLES
@@ -711,6 +733,14 @@ config NF_FLOW_TABLE
To compile it as a module, choose M here.
+config NF_FLOW_TABLE_PROCFS
+ bool "Supply flow table statistics in procfs"
+ depends on NF_FLOW_TABLE
+ depends on PROC_FS
+ help
+ This option enables the flow table offload statistics
+ to be shown in procfs under net/netfilter/nf_flowtable.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n
@@ -720,12 +750,22 @@ config NETFILTER_XTABLES
if NETFILTER_XTABLES
+config NETFILTER_XTABLES_COMPAT
+ bool "Netfilter Xtables 32bit support"
+ depends on COMPAT
+ default y
+ help
+ This option provides a translation layer to run 32bit arp,ip(6),ebtables
+ binaries on 64bit kernels.
+
+ If unsure, say N.
+
comment "Xtables combined modules"
config NETFILTER_XT_MARK
tristate 'nfmark target and match support'
default m if NETFILTER_ADVANCED=n
- ---help---
+ help
This option adds the "MARK" target and "mark" match.
Netfilter mark matching allows you to match packets based on the
@@ -741,7 +781,7 @@ config NETFILTER_XT_CONNMARK
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
- ---help---
+ help
This option adds the "CONNMARK" target and "connmark" match.
Netfilter allows you to store a mark value per connection (a.k.a.
@@ -768,7 +808,7 @@ config NETFILTER_XT_TARGET_AUDIT
tristate "AUDIT target support"
depends on AUDIT
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a 'AUDIT' target, which can be used to create
audit records for packets dropped/accepted.
@@ -778,7 +818,7 @@ config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
depends on IP_NF_MANGLE || IP6_NF_MANGLE
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a `CHECKSUM' target, which can be used in the iptables mangle
table to work around buggy DHCP clients in virtualized environments.
@@ -798,7 +838,7 @@ config NETFILTER_XT_TARGET_CLASSIFY
the priority of a packet. Some qdiscs can use this value for
classification, among these are:
- atm, cbq, dsmark, pfifo_fast, htb, prio
+ atm, cbq, dsmark, pfifo_fast, htb, prio
To compile it as a module, choose M here. If unsure, say N.
@@ -807,7 +847,7 @@ config NETFILTER_XT_TARGET_CONNMARK
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_CONNMARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
@@ -856,7 +896,7 @@ config NETFILTER_XT_TARGET_HL
tristate '"HL" hoplimit target support'
depends on IP_NF_MANGLE || IP6_NF_MANGLE
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
targets, which enable the user to change the
hoplimit/time-to-live value of the IP header.
@@ -871,7 +911,7 @@ config NETFILTER_XT_TARGET_HMARK
tristate '"HMARK" target support'
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds the "HMARK" target.
The target allows you to create rules in the "raw" and "mangle" tables
@@ -919,8 +959,7 @@ config NETFILTER_XT_TARGET_LED
config NETFILTER_XT_TARGET_LOG
tristate "LOG target support"
- select NF_LOG_COMMON
- select NF_LOG_IPV4
+ select NF_LOG_SYSLOG
select NF_LOG_IPV6 if IP6_NF_IPTABLES
default m if NETFILTER_ADVANCED=n
help
@@ -933,7 +972,7 @@ config NETFILTER_XT_TARGET_MARK
tristate '"MARK" target support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
@@ -941,7 +980,7 @@ config NETFILTER_XT_TARGET_MARK
config NETFILTER_XT_NAT
tristate '"SNAT and DNAT" targets support'
depends on NF_NAT
- ---help---
+ help
This option enables the SNAT and DNAT targets.
To compile it as a module, choose M here. If unsure, say N.
@@ -949,7 +988,7 @@ config NETFILTER_XT_NAT
config NETFILTER_XT_TARGET_NETMAP
tristate '"NETMAP" target support'
depends on NF_NAT
- ---help---
+ help
NETMAP is an implementation of static 1:1 NAT mapping of network
addresses. It maps the network address part, while keeping the host
address part intact.
@@ -999,7 +1038,7 @@ config NETFILTER_XT_TARGET_REDIRECT
tristate "REDIRECT target support"
depends on NF_NAT
select NF_NAT_REDIRECT
- ---help---
+ help
REDIRECT is a special case of NAT: all incoming connections are
mapped onto the incoming interface's address, causing the packets to
come to the local machine instead of passing through. This is
@@ -1029,7 +1068,7 @@ config NETFILTER_XT_TARGET_TEE
depends on IP6_NF_IPTABLES || !IP6_NF_IPTABLES
select NF_DUP_IPV4
select NF_DUP_IPV6 if IP6_NF_IPTABLES
- ---help---
+ help
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -1051,7 +1090,7 @@ config NETFILTER_XT_TARGET_TPROXY
on Netfilter connection tracking and NAT, unlike REDIRECT.
For it to work you will have to configure certain iptables rules
and use policy routing. For more information on how to set it up
- see Documentation/networking/tproxy.txt.
+ see Documentation/networking/tproxy.rst.
To compile it as a module, choose M here. If unsure, say N.
@@ -1081,7 +1120,7 @@ config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
- ---help---
+ help
This option adds a `TCPMSS' target, which allows you to alter the
MSS value of TCP SYN packets, to control the maximum size for that
connection (usually limiting it to your outgoing interface's MTU
@@ -1119,7 +1158,7 @@ comment "Xtables matches"
config NETFILTER_XT_MATCH_ADDRTYPE
tristate '"addrtype" address type match support'
default m if NETFILTER_ADVANCED=n
- ---help---
+ help
This option allows you to match what routing thinks of an address,
eg. UNICAST, LOCAL, BROADCAST, ...
@@ -1140,7 +1179,7 @@ config NETFILTER_XT_MATCH_CGROUP
depends on NETFILTER_ADVANCED
depends on CGROUPS
select CGROUP_NET_CLASSID
- ---help---
+ help
Socket/process control group matching allows you to match locally
generated packets based on which net_cls control group processes
belong to.
@@ -1149,7 +1188,7 @@ config NETFILTER_XT_MATCH_CLUSTER
tristate '"cluster" match support'
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option allows you to build work-load-sharing clusters of
network servers/stateful firewalls without having a dedicated
load-balancing router/server/switch. Basically, this match returns
@@ -1187,7 +1226,7 @@ config NETFILTER_XT_MATCH_CONNLABEL
select NF_CONNTRACK_LABELS
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
- ---help---
+ help
This match allows you to test and assign userspace-defined labels names
to a connection. The kernel only stores bit values - mapping
names to bits is done by userspace.
@@ -1200,7 +1239,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_CONNCOUNT
- ---help---
+ help
This match allows you to match against the number of parallel
connections to a server per client IP address (or address block).
@@ -1209,7 +1248,7 @@ config NETFILTER_XT_MATCH_CONNMARK
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_CONNMARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
@@ -1275,7 +1314,7 @@ config NETFILTER_XT_MATCH_DSCP
config NETFILTER_XT_MATCH_ECN
tristate '"ecn" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds an "ECN" match, which allows you to match against
the IPv4 and TCP header ECN fields.
@@ -1318,7 +1357,7 @@ config NETFILTER_XT_MATCH_HELPER
config NETFILTER_XT_MATCH_HL
tristate '"hl" hoplimit/TTL match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
HL matching allows you to match packets based on the hoplimit
in the IPv6 header, or the time-to-live field in the IPv4
header of the packet.
@@ -1335,7 +1374,7 @@ config NETFILTER_XT_MATCH_IPCOMP
config NETFILTER_XT_MATCH_IPRANGE
tristate '"iprange" address range match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a "iprange" match, which allows you to match based on
an IP address range. (Normal iptables only matches on single addresses
with an optional mask.)
@@ -1356,7 +1395,7 @@ config NETFILTER_XT_MATCH_L2TP
tristate '"l2tp" match support'
depends on NETFILTER_ADVANCED
default L2TP
- ---help---
+ help
This option adds an "L2TP" match, which allows you to match against
L2TP protocol header fields.
@@ -1394,7 +1433,7 @@ config NETFILTER_XT_MATCH_MARK
tristate '"mark" match support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
@@ -1436,7 +1475,7 @@ config NETFILTER_XT_MATCH_OSF
config NETFILTER_XT_MATCH_OWNER
tristate '"owner" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
Socket owner matching allows you to match locally-generated packets
based on who created the socket: the user or group. It is also
possible to check whether a socket actually exists.
@@ -1511,7 +1550,7 @@ config NETFILTER_XT_MATCH_REALM
config NETFILTER_XT_MATCH_RECENT
tristate '"recent" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This match is used for creating one or many lists of recently
used addresses and then matching against that/those list(s).
@@ -1594,7 +1633,7 @@ config NETFILTER_XT_MATCH_TCPMSS
config NETFILTER_XT_MATCH_TIME
tristate '"time" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a "time" match, which allows you to match based on
the packet arrival time (at the machine which netfilter is running)
on) or departure time/date (for locally generated packets).
@@ -1608,7 +1647,7 @@ config NETFILTER_XT_MATCH_TIME
config NETFILTER_XT_MATCH_U32
tristate '"u32" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
u32 allows you to extract quantities of up to 4 bytes from a packet,
AND them with specified masks, shift them by specified amounts and
test whether the results are in any of a set of specified ranges.
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3f572e5a975e..0f060d100880 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -14,6 +14,11 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -22,6 +27,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o
+obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
@@ -48,16 +54,18 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o
-# generic transport layer logging
-obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
-
-# packet logging for netdev family
-obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
+obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
+ifeq ($(CONFIG_NF_NAT),m)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
+else ifeq ($(CONFIG_NF_NAT),y)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o
+endif
+
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -77,15 +85,18 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
- nft_chain_route.o nf_tables_offload.o
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \
+ nft_counter.o nft_chain_route.o nf_tables_offload.o \
+ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
+ nft_set_pipapo.o
-nf_tables_set-objs := nf_tables_set_core.o \
- nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
- nft_set_pipapo.o
+ifdef CONFIG_X86_64
+ifndef CONFIG_UML
+nf_tables-objs += nft_set_pipapo_avx2.o
+endif
+endif
obj-$(CONFIG_NF_TABLES) += nf_tables.o
-obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
@@ -98,8 +109,8 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o
obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
-obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
@@ -123,6 +134,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \
nf_flow_table_offload.o
+nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
@@ -211,3 +223,6 @@ obj-$(CONFIG_IP_SET) += ipset/
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/
+
+# lwtunnel
+obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78f046ec506f..5a6705a0e4ec 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -58,7 +58,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
if (num == 0)
return NULL;
- e = kvzalloc(alloc, GFP_KERNEL);
+ e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
if (e)
e->num_hook_entries = num;
return e;
@@ -282,6 +282,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return NULL;
return net->nf.hooks_bridge + hooknum;
#endif
+#ifdef CONFIG_NETFILTER_INGRESS
+ case NFPROTO_INET:
+ if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS))
+ return NULL;
+ if (!dev || dev_net(dev) != net) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+ return &dev->nf_hooks_ingress;
+#endif
case NFPROTO_IPV4:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
return NULL;
@@ -290,12 +300,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
return NULL;
return net->nf.hooks_ipv6 + hooknum;
-#if IS_ENABLED(CONFIG_DECNET)
- case NFPROTO_DECNET:
- if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum))
- return NULL;
- return net->nf.hooks_decnet + hooknum;
-#endif
default:
WARN_ON_ONCE(1);
return NULL;
@@ -307,24 +311,106 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return &dev->nf_hooks_ingress;
}
#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (hooknum == NF_NETDEV_EGRESS) {
+ if (dev && dev_net(dev) == net)
+ return &dev->nf_hooks_egress;
+ }
+#endif
WARN_ON_ONCE(1);
return NULL;
}
+static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg,
+ int hooknum)
+{
+#ifndef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == hooknum)
+ return -EOPNOTSUPP;
+#endif
+ if (reg->hooknum != hooknum ||
+ !reg->dev || dev_net(reg->dev) != net)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg,
+ int pf)
+{
+ if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) ||
+ (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS))
+ return true;
+
+ return false;
+}
+
+static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg,
+ int pf)
+{
+ return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS;
+}
+
+static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_inc(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
+static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_dec(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
+ int err;
- if (pf == NFPROTO_NETDEV) {
+ switch (pf) {
+ case NFPROTO_NETDEV:
#ifndef CONFIG_NETFILTER_INGRESS
if (reg->hooknum == NF_NETDEV_INGRESS)
return -EOPNOTSUPP;
#endif
- if (reg->hooknum != NF_NETDEV_INGRESS ||
+#ifndef CONFIG_NETFILTER_EGRESS
+ if (reg->hooknum == NF_NETDEV_EGRESS)
+ return -EOPNOTSUPP;
+#endif
+ if ((reg->hooknum != NF_NETDEV_INGRESS &&
+ reg->hooknum != NF_NETDEV_EGRESS) ||
!reg->dev || dev_net(reg->dev) != net)
return -EINVAL;
+ break;
+ case NFPROTO_INET:
+ if (reg->hooknum != NF_INET_INGRESS)
+ break;
+
+ err = nf_ingress_check(net, reg, NF_INET_INGRESS);
+ if (err < 0)
+ return err;
+ break;
}
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
@@ -336,21 +422,25 @@ static int __nf_register_net_hook(struct net *net, int pf,
p = nf_entry_dereference(*pp);
new_hooks = nf_hook_entries_grow(p, reg);
- if (!IS_ERR(new_hooks))
+ if (!IS_ERR(new_hooks)) {
+ hooks_validate(new_hooks);
rcu_assign_pointer(*pp, new_hooks);
+ }
mutex_unlock(&nf_hook_mutex);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);
- hooks_validate(new_hooks);
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_inc_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_egress_hook(reg, pf))
+ net_inc_egress_queue();
#endif
+ nf_static_key_inc(reg, pf);
+
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
@@ -376,7 +466,7 @@ static bool nf_remove_net_hook(struct nf_hook_entries *old,
if (orig_ops[i] != unreg)
continue;
WRITE_ONCE(old->hooks[i].hook, accept_all);
- WRITE_ONCE(orig_ops[i], &dummy_ops);
+ WRITE_ONCE(orig_ops[i], (void *)&dummy_ops);
return true;
}
@@ -403,12 +493,14 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
if (nf_remove_net_hook(p, reg)) {
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_dec_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_egress_hook(reg, pf))
+ net_dec_egress_queue();
#endif
+ nf_static_key_dec(reg, pf);
} else {
WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
}
@@ -425,8 +517,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
if (reg->pf == NFPROTO_INET) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ if (reg->hooknum == NF_INET_INGRESS) {
+ __nf_unregister_net_hook(net, NFPROTO_INET, reg);
+ } else {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ }
} else {
__nf_unregister_net_hook(net, reg->pf, reg);
}
@@ -451,14 +547,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
int err;
if (reg->pf == NFPROTO_INET) {
- err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
- if (err < 0)
- return err;
-
- err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
- if (err < 0) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- return err;
+ if (reg->hooknum == NF_INET_INGRESS) {
+ err = __nf_register_net_hook(net, NFPROTO_INET, reg);
+ if (err < 0)
+ return err;
+ } else {
+ err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
+ if (err < 0)
+ return err;
+
+ err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
+ if (err < 0) {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ return err;
+ }
}
} else {
err = __nf_register_net_hook(net, reg->pf, reg);
@@ -514,7 +616,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
case NF_ACCEPT:
break;
case NF_DROP:
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_NETFILTER_DROP);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
@@ -559,32 +662,29 @@ EXPORT_SYMBOL(nf_hook_slow_list);
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
-struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
-struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-/* This does not belong here, but locally generated errors need it if connection
- tracking in use: without this, connection may not be in hash table, and hence
- manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
- __rcu __read_mostly;
-EXPORT_SYMBOL(ip_ct_attach);
-
-struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);
+/* This does not belong here, but locally generated errors need it if connection
+ * tracking in use: without this, connection may not be in hash table, and hence
+ * manufactured ICMP or RST packets will not be associated with it.
+ */
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
- void (*attach)(struct sk_buff *, const struct sk_buff *);
+ const struct nf_ct_hook *ct_hook;
if (skb->_nfct) {
rcu_read_lock();
- attach = rcu_dereference(ip_ct_attach);
- if (attach)
- attach(new, skb);
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->attach(new, skb);
rcu_read_unlock();
}
}
@@ -592,7 +692,7 @@ EXPORT_SYMBOL(nf_ct_attach);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
@@ -605,7 +705,7 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
bool ret = false;
rcu_read_lock();
@@ -644,10 +744,6 @@ static int __net_init netfilter_net_init(struct net *net)
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
#endif
-#if IS_ENABLED(CONFIG_DECNET)
- __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
-#endif
-
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 0a2196f59106..a8ce04a4bb72 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -46,7 +46,7 @@ struct bitmap_ip {
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
@@ -326,7 +326,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
set->variant = &bitmap_ip;
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 739e343efaf6..2c625e0f49ec 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -49,7 +49,7 @@ struct bitmap_ipmac {
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* MAC + data extensions */
+ unsigned char extensions[] /* MAC + data extensions */
__aligned(__alignof__(u64));
};
@@ -363,7 +363,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_ipmac;
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b49978dd810d..7138e080def4 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -37,7 +37,7 @@ struct bitmap_port {
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
@@ -274,7 +274,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_port;
if (!init_map_port(set, map, first_port, last_port)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 8dd17589217d..e7ba5b6dd2b7 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -86,7 +86,8 @@ find_set_type(const char *name, u8 family, u8 revision)
{
struct ip_set_type *type;
- list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ list_for_each_entry_rcu(type, &ip_set_type_list, list,
+ lockdep_is_held(&ip_set_type_mutex))
if (STRNCMP(type->name, name) &&
(type->family == family ||
type->family == NFPROTO_UNSPEC) &&
@@ -249,22 +250,7 @@ EXPORT_SYMBOL_GPL(ip_set_type_unregister);
void *
ip_set_alloc(size_t size)
{
- void *members = NULL;
-
- if (size < KMALLOC_MAX_SIZE)
- members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
-
- if (members) {
- pr_debug("%p: allocated with kmalloc\n", members);
- return members;
- }
-
- members = vzalloc(size);
- if (!members)
- return NULL;
- pr_debug("%p: allocated with vmalloc\n", members);
-
- return members;
+ return kvzalloc(size, GFP_KERNEL_ACCOUNT);
}
EXPORT_SYMBOL_GPL(ip_set_alloc);
@@ -285,8 +271,7 @@ flag_nested(const struct nlattr *nla)
static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
[IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
- [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr) },
+ [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
};
int
@@ -368,7 +353,7 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
if (unlikely(!c))
return;
- strlcpy(c->str, ext->comment, len + 1);
+ strscpy(c->str, ext->comment, len + 1);
set->ext_size += sizeof(*c) + strlen(c->str) + 1;
rcu_assign_pointer(comment->c, c);
}
@@ -459,6 +444,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len,
for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
if (!add_extension(id, cadt_flags, tb))
continue;
+ if (align < ip_set_extensions[id].align)
+ align = ip_set_extensions[id].align;
len = ALIGN(len, ip_set_extensions[id].align);
set->offset[id] = len;
set->extensions |= ip_set_extensions[id].type;
@@ -649,13 +636,14 @@ ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set)) {
struct ip_set_counter *counter = ext_counter(data, set);
+ ip_set_update_counter(counter, ext, flags);
+
if (flags & IPSET_FLAG_MATCH_COUNTERS &&
!(ip_set_match_counter(ip_set_get_packets(counter),
mext->packets, mext->packets_op) &&
ip_set_match_counter(ip_set_get_bytes(counter),
mext->bytes, mext->bytes_op)))
return false;
- ip_set_update_counter(counter, ext, flags);
}
if (SET_WITH_SKBINFO(set))
ip_set_get_skbinfo(ext_skbinfo(data, set),
@@ -975,20 +963,9 @@ static struct nlmsghdr *
start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
enum ipset_cmd cmd)
{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
-
- nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd),
- sizeof(*nfmsg), flags);
- if (!nlh)
- return NULL;
-
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = NFPROTO_IPV4;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- return nlh;
+ return nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags,
+ NFPROTO_IPV4, NFNETLINK_V0, 0);
}
/* Create a set */
@@ -1054,26 +1031,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
return 0;
}
-static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
return -EOPNOTSUPP;
}
-static int ip_set_create(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set, *clash = NULL;
ip_set_id_t index = IPSET_INVALID_ID;
struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
const char *name, *typename;
u8 family, revision;
- u32 flags = flag_exist(nlh);
+ u32 flags = flag_exist(info->nlh);
int ret = 0;
if (unlikely(protocol_min_failed(attr) ||
@@ -1099,7 +1072,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
if (!set)
return -ENOMEM;
spin_lock_init(&set->lock);
- strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ strscpy(set->name, name, IPSET_MAXNAMELEN);
set->family = family;
set->revision = revision;
@@ -1121,8 +1094,10 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
ret = -IPSET_ERR_PROTOCOL;
goto put_out;
}
+ /* Set create flags depending on the type revision */
+ set->flags |= set->type->create_flags[revision];
- ret = set->type->create(net, set, tb, flags);
+ ret = set->type->create(info->net, set, tb, flags);
if (ret != 0)
goto put_out;
@@ -1204,12 +1179,10 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set);
}
-static int ip_set_destroy(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *s;
ip_set_id_t i;
int ret = 0;
@@ -1251,10 +1224,12 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
/* Modified by ip_set_destroy() only, which is serialized */
inst->is_destroyed = false;
} else {
+ u32 flags = flag_exist(info->nlh);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
- ret = -ENOENT;
+ if (!(flags & IPSET_FLAG_EXIST))
+ ret = -ENOENT;
goto out;
} else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY;
@@ -1283,12 +1258,10 @@ ip_set_flush_set(struct ip_set *set)
ip_set_unlock(set);
}
-static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *s;
ip_set_id_t i;
@@ -1323,12 +1296,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
-static int ip_set_rename(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set, *s;
const char *name2;
ip_set_id_t i;
@@ -1373,12 +1344,10 @@ out:
* so the ip_set_list always contains valid pointers to the sets.
*/
-static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *from, *to;
ip_set_id_t from_id, to_id;
char from_name[IPSET_MAXNAMELEN];
@@ -1641,7 +1610,7 @@ dump_last:
goto next_set;
if (set->variant->uref)
set->variant->uref(set, cb, true);
- /* fall through */
+ fallthrough;
default:
ret = set->variant->list(set, skb, cb);
if (!cb->args[IPSET_CB_ARG0])
@@ -1688,10 +1657,8 @@ out:
return ret < 0 ? ret : skb->len;
}
-static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
@@ -1702,7 +1669,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
.dump = ip_set_dump_do,
.done = ip_set_dump_done,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
}
@@ -1718,8 +1685,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
};
static int
-call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
- struct nlattr *tb[], enum ipset_adt adt,
+call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt,
u32 flags, bool use_lineno)
{
int ret;
@@ -1752,11 +1719,13 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
skb2 = nlmsg_new(payload, GFP_KERNEL);
if (!skb2)
return -ENOMEM;
- rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ rep = nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
errmsg = nlmsg_data(rep);
errmsg->error = ret;
- memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len,
+ /* Bounds checked by the skb layer. */);
+
cmdattr = (void *)&errmsg->msg + min_len;
ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr,
@@ -1771,8 +1740,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
+ nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
/* Signal netlink not to send its ACK/errmsg. */
return -EINTR;
}
@@ -1816,7 +1784,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
attr[IPSET_ATTR_DATA],
set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, adt, flags,
+ ret = call_ad(net, ctnl, skb, set, tb, adt, flags,
use_lineno);
} else {
int nla_rem;
@@ -1827,7 +1795,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, adt,
+ ret = call_ad(net, ctnl, skb, set, tb, adt,
flags, use_lineno);
if (ret < 0)
return ret;
@@ -1836,30 +1804,24 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
return ret;
}
-static int ip_set_uadd(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- return ip_set_ad(net, ctnl, skb,
- IPSET_ADD, nlh, attr, extack);
+ return ip_set_ad(info->net, info->sk, skb,
+ IPSET_ADD, info->nlh, attr, info->extack);
}
-static int ip_set_udel(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- return ip_set_ad(net, ctnl, skb,
- IPSET_DEL, nlh, attr, extack);
+ return ip_set_ad(info->net, info->sk, skb,
+ IPSET_DEL, info->nlh, attr, info->extack);
}
-static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
@@ -1891,16 +1853,13 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
/* Get header data of a set */
-static int ip_set_header(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
const struct ip_set *set;
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
- int ret = 0;
if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
@@ -1914,7 +1873,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_HEADER);
if (!nlh2)
goto nlmsg_failure;
@@ -1926,11 +1885,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1948,10 +1903,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
};
-static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
@@ -1974,7 +1927,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_TYPE);
if (!nlh2)
goto nlmsg_failure;
@@ -1987,11 +1940,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
nlmsg_end(skb2, nlh2);
pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2007,14 +1956,11 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
};
-static int ip_set_protocol(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
- int ret = 0;
if (unlikely(!attr[IPSET_ATTR_PROTOCOL]))
return -IPSET_ERR_PROTOCOL;
@@ -2023,7 +1969,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_PROTOCOL);
if (!nlh2)
goto nlmsg_failure;
@@ -2033,11 +1979,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2048,17 +1990,14 @@ nlmsg_failure:
/* Get set by name or index, from userspace */
-static int ip_set_byname(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
ip_set_id_t id = IPSET_INVALID_ID;
const struct ip_set *set;
- int ret = 0;
if (unlikely(protocol_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
@@ -2072,7 +2011,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_GET_BYNAME);
if (!nlh2)
goto nlmsg_failure;
@@ -2082,11 +2021,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2100,17 +2035,14 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_INDEX] = { .type = NLA_U16 },
};
-static int ip_set_byindex(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
ip_set_id_t id = IPSET_INVALID_ID;
const struct ip_set *set;
- int ret = 0;
if (unlikely(protocol_failed(attr) ||
!attr[IPSET_ATTR_INDEX]))
@@ -2127,7 +2059,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_GET_BYINDEX);
if (!nlh2)
goto nlmsg_failure;
@@ -2136,11 +2068,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2152,80 +2080,96 @@ nlmsg_failure:
static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
[IPSET_CMD_NONE] = {
.call = ip_set_none,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
},
[IPSET_CMD_CREATE] = {
.call = ip_set_create,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_create_policy,
},
[IPSET_CMD_DESTROY] = {
.call = ip_set_destroy,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_FLUSH] = {
.call = ip_set_flush,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_RENAME] = {
.call = ip_set_rename,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname2_policy,
},
[IPSET_CMD_SWAP] = {
.call = ip_set_swap,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname2_policy,
},
[IPSET_CMD_LIST] = {
.call = ip_set_dump,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_dump_policy,
},
[IPSET_CMD_SAVE] = {
.call = ip_set_dump,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_ADD] = {
.call = ip_set_uadd,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_DEL] = {
.call = ip_set_udel,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_TEST] = {
.call = ip_set_utest,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_HEADER] = {
.call = ip_set_header,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_TYPE] = {
.call = ip_set_type,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_type_policy,
},
[IPSET_CMD_PROTOCOL] = {
.call = ip_set_protocol,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_protocol_policy,
},
[IPSET_CMD_GET_BYNAME] = {
.call = ip_set_byname,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_GET_BYINDEX] = {
.call = ip_set_byindex,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_index_policy,
},
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index e52d7b7597a0..3adc291d9ce1 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -37,37 +37,12 @@
*/
/* Number of elements to store in an initial array block */
-#define AHASH_INIT_SIZE 4
+#define AHASH_INIT_SIZE 2
/* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max number of elements in the array block when tuned */
#define AHASH_MAX_TUNED 64
-
-/* Max number of elements can be tuned */
-#ifdef IP_SET_HASH_WITH_MULTI
-#define AHASH_MAX(h) ((h)->ahash_max)
-
-static u8
-tune_ahash_max(u8 curr, u32 multi)
-{
- u32 n;
-
- if (multi < curr)
- return curr;
-
- n = curr + AHASH_INIT_SIZE;
- /* Currently, at listing one hash bucket must fit into a message.
- * Therefore we have a hard limit here.
- */
- return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
-}
-
-#define TUNE_AHASH_MAX(h, multi) \
- ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
-#else
-#define AHASH_MAX(h) AHASH_MAX_SIZE
-#define TUNE_AHASH_MAX(h, multi)
-#endif
+#define AHASH_MAX(h) ((h)->bucketsize)
/* A hash bucket */
struct hbucket {
@@ -76,7 +51,7 @@ struct hbucket {
DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
- unsigned char value[0] /* the array of the values */
+ unsigned char value[] /* the array of the values */
__aligned(__alignof__(u64));
};
@@ -109,7 +84,7 @@ struct htable {
u8 htable_bits; /* size of hash table == 2^htable_bits */
u32 maxelem; /* Maxelem per region */
struct ip_set_region *hregion; /* Region locks and ext sizes */
- struct hbucket __rcu *bucket[0]; /* hashtable buckets */
+ struct hbucket __rcu *bucket[]; /* hashtable buckets */
};
#define hbucket(h, i) ((h)->bucket[i])
@@ -132,31 +107,17 @@ htable_size(u8 hbits)
{
size_t hsize;
- /* We must fit both into u32 in jhash and size_t */
+ /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
if (hbits > 31)
return 0;
hsize = jhash_size(hbits);
- if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
+ if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
< hsize)
return 0;
return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}
-/* Compute htable_bits from the user input parameter hashsize */
-static u8
-htable_bits(u32 hashsize)
-{
- /* Assume that hashsize == 2^htable_bits */
- u8 bits = fls(hashsize - 1);
-
- if (jhash_size(bits) != hashsize)
- /* Round up to the first 2^n value */
- bits = fls(hashsize);
-
- return bits;
-}
-
#ifdef IP_SET_HASH_WITH_NETS
#if IPSET_NET_COUNT > 1
#define __CIDR(cidr, i) (cidr[i])
@@ -321,9 +282,7 @@ struct htype {
#ifdef IP_SET_HASH_WITH_MARKMASK
u32 markmask; /* markmask value for mark mask to store */
#endif
-#ifdef IP_SET_HASH_WITH_MULTI
- u8 ahash_max; /* max elements in an array block */
-#endif
+ u8 bucketsize; /* max elements in an array block */
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
@@ -644,7 +603,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t dsize = set->dsize;
+ size_t hsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -668,21 +627,19 @@ mtype_resize(struct ip_set *set, bool retried)
retry:
ret = 0;
htable_bits++;
- if (!htable_bits) {
- /* In case we have plenty of memory :-) */
- pr_warn("Cannot increase the hashsize of set %s further\n",
- set->name);
- ret = -IPSET_ERR_HASH_FULL;
- goto out;
- }
- t = ip_set_alloc(htable_size(htable_bits));
+ if (!htable_bits)
+ goto hbwarn;
+ hsize = htable_size(htable_bits);
+ if (!hsize)
+ goto hbwarn;
+ t = ip_set_alloc(hsize);
if (!t) {
ret = -ENOMEM;
goto out;
}
t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
if (!t->hregion) {
- kfree(t);
+ ip_set_free(t);
ret = -ENOMEM;
goto out;
}
@@ -817,6 +774,12 @@ cleanup:
if (ret == -EAGAIN)
goto retry;
goto out;
+
+hbwarn:
+ /* In case we have plenty of memory :-) */
+ pr_warn("Cannot increase the hashsize of set %s further\n", set->name);
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
}
/* Get the current number of elements and ext_size in the set */
@@ -950,7 +913,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_AHASH_MAX(h, multi);
+#ifdef IP_SET_HASH_WITH_MULTI
+ if (h->bucketsize >= AHASH_MAX_TUNED)
+ goto set_full;
+ else if (h->bucketsize < multi)
+ h->bucketsize += AHASH_INIT_SIZE;
+#endif
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
@@ -1305,6 +1273,11 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
goto nla_put_failure;
#endif
+ if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) {
+ if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) ||
+ nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval)))
+ goto nla_put_failure;
+ }
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
@@ -1520,7 +1493,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
if (!h)
return -ENOMEM;
- hbits = htable_bits(hashsize);
+ /* Compute htable_bits from the user input parameter hashsize.
+ * Assume that hashsize == 2^htable_bits,
+ * otherwise round up to the first 2^n value.
+ */
+ hbits = fls(hashsize - 1);
hsize = htable_size(hbits);
if (hsize == 0) {
kfree(h);
@@ -1533,7 +1510,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
}
t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
if (!t->hregion) {
- kfree(t);
+ ip_set_free(t);
kfree(h);
return -ENOMEM;
}
@@ -1547,8 +1524,20 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#ifdef IP_SET_HASH_WITH_MARKMASK
h->markmask = markmask;
#endif
- get_random_bytes(&h->initval, sizeof(h->initval));
-
+ if (tb[IPSET_ATTR_INITVAL])
+ h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL]));
+ else
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->bucketsize = AHASH_MAX_SIZE;
+ if (tb[IPSET_ATTR_BUCKETSIZE]) {
+ h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]);
+ if (h->bucketsize < AHASH_INIT_SIZE)
+ h->bucketsize = AHASH_INIT_SIZE;
+ else if (h->bucketsize > AHASH_MAX_SIZE)
+ h->bucketsize = AHASH_MAX_SIZE;
+ else if (h->bucketsize % 2)
+ h->bucketsize += 1;
+ }
t->htable_bits = hbits;
t->maxelem = h->maxelem / ahash_numof_locks(hbits);
RCU_INIT_POINTER(h->table, t);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 5d6d68eaf6a9..dd30c03d5a23 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -23,7 +23,8 @@
/* 1 Counters support */
/* 2 Comments support */
/* 3 Forceadd support */
-#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
+/* 4 skbinfo support */
+#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -131,8 +132,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to)
+ if (ip > ip_to) {
+ if (ip_to == 0)
+ return -IPSET_ERR_HASH_ELEM;
swap(ip, ip_to);
+ }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -143,6 +147,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
+ /* 64bit division is not allowed on 32bit */
+ if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE)
+ return -ERANGE;
+
if (retried) {
ip = ntohl(h->next.ip);
e.ip = htonl(ip);
@@ -277,11 +285,13 @@ static struct ip_set_type hash_ip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
index eceb7bc4a93a..467c59a83c0a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmac.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -23,7 +23,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
@@ -268,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index aba1df617d6e..153de3457423 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -21,7 +21,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */
+/* 2 skbinfo support */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
@@ -120,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
+ if (e.mark == 0 && e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
if (adt == IPSET_TEST ||
!(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
@@ -132,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to)
+ if (ip > ip_to) {
+ if (e.mark == 0 && ip_to == 0)
+ return -IPSET_ERR_HASH_ELEM;
swap(ip, ip_to);
+ }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -142,6 +148,9 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
ip_set_mask_from_to(ip, ip_to, cidr);
}
+ if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE)
+ return -ERANGE;
+
if (retried)
ip = ntohl(h->next.ip);
for (; ip <= ip_to; ip++) {
@@ -274,12 +283,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmark_create,
.create_policy = {
[IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 1ff228717e29..7303138e46be 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -25,7 +25,8 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -172,6 +173,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(port, port_to);
}
+ if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE)
+ return -ERANGE;
+
if (retried)
ip = ntohl(h->next.ip);
for (; ip <= ip_to; ip++) {
@@ -341,11 +345,13 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index fa88afd812fa..334fb1ad0e86 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -25,7 +25,8 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -179,6 +180,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(port, port_to);
}
+ if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE)
+ return -ERANGE;
+
if (retried)
ip = ntohl(h->next.ip);
for (; ip <= ip_to; ip++) {
@@ -356,11 +360,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index eef6ecfcb409..7df94f437f60 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -27,7 +27,8 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -252,6 +253,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(port, port_to);
}
+ if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE)
+ return -ERANGE;
+
ip2_to = ip2_from;
if (tb[IPSET_ATTR_IP2_TO]) {
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
@@ -513,11 +517,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 0b61593165ef..718814730acf 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -16,7 +16,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -125,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_mac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 136cf0781d3a..1422739d9aa2 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -24,7 +24,8 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */
+/* 6 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -139,7 +140,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = { .cidr = HOST_MASK };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0;
+ u32 ip = 0, ip_to = 0, ipn, n = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -187,6 +188,15 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
}
+ ipn = ip;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr);
+ n++;
+ } while (ipn++ < ip_to);
+
+ if (n > IPSET_MAX_RANGE)
+ return -ERANGE;
+
if (retried)
ip = ntohl(h->next.ip);
do {
@@ -354,11 +364,13 @@ static struct ip_set_type hash_net_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_net_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index be5e95a0d876..9810f5bf63f5 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -26,7 +26,8 @@
/* 4 Comments support added */
/* 5 Forceadd support added */
/* 6 skbinfo support added */
-#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */
+/* 7 interface wildcard support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -201,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0;
+ u32 ip = 0, ip_to = 0, ipn, n = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -225,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -255,6 +256,14 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
} else {
ip_set_mask_from_to(ip, ip_to, e.cidr);
}
+ ipn = ip;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr);
+ n++;
+ } while (ipn++ < ip_to);
+
+ if (n > IPSET_MAX_RANGE)
+ return -ERANGE;
if (retried)
ip = ntohl(h->next.ip);
@@ -442,7 +451,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip, e.cidr);
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -470,11 +479,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netiface_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index da4ef910b12d..3d09eefe998a 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -22,7 +22,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -167,7 +168,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0;
- u32 ip2 = 0, ip2_from = 0, ip2_to = 0;
+ u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn;
+ u64 n = 0, m = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -243,6 +245,19 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
} else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
}
+ ipn = ip;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]);
+ n++;
+ } while (ipn++ < ip_to);
+ ipn = ip2_from;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]);
+ m++;
+ } while (ipn++ < ip2_to);
+
+ if (n*m > IPSET_MAX_RANGE)
+ return -ERANGE;
if (retried) {
ip = ntohl(h->next.ip[0]);
@@ -459,11 +474,13 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 34448df80fb9..09cf72eb37f8 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -26,7 +26,8 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -157,7 +158,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 port, port_to, p = 0, ip = 0, ip_to = 0;
+ u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn;
+ u64 n = 0;
bool with_ports = false;
u8 cidr;
int ret;
@@ -234,6 +236,14 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
} else {
ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
}
+ ipn = ip;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr);
+ n++;
+ } while (ipn++ < ip_to);
+
+ if (n*(port_to - port + 1) > IPSET_MAX_RANGE)
+ return -ERANGE;
if (retried) {
ip = ntohl(h->next.ip);
@@ -460,11 +470,13 @@ static struct ip_set_type hash_netport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 934c1712cba8..19bcdb3141f6 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -23,7 +23,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 0 Comments support added */
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -181,7 +182,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, p = 0, port, port_to;
- u32 ip2_from = 0, ip2_to = 0, ip2;
+ u32 ip2_from = 0, ip2_to = 0, ip2, ipn;
+ u64 n = 0, m = 0;
bool with_ports = false;
int ret;
@@ -283,6 +285,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
} else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
}
+ ipn = ip;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]);
+ n++;
+ } while (ipn++ < ip_to);
+ ipn = ip2_from;
+ do {
+ ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]);
+ m++;
+ } while (ipn++ < ip2_to);
+
+ if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE)
+ return -ERANGE;
if (retried) {
ip = ntohl(h->next.ip[0]);
@@ -558,11 +573,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index cd747c0962fd..5a67f7966574 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -59,7 +59,7 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
/* Don't lookup sub-counters at all */
opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
- opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
+ opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE;
list_for_each_entry_rcu(e, &map->members, list) {
ret = ip_set_test(e->id, skb, par, opt);
if (ret <= 0)
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 5b672e05d758..271da8447b29 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -4,9 +4,9 @@
#
menuconfig IP_VS
tristate "IP virtual server support"
- depends on NET && INET && NETFILTER
+ depends on INET && NETFILTER
depends on (NF_CONNTRACK || NF_CONNTRACK=n)
- ---help---
+ help
IP Virtual Server support will let you build a high-performance
virtual server based on cluster of two or more real servers. This
option must be enabled for at least one of the clustered computers
@@ -29,16 +29,15 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
- select IP6_NF_IPTABLES
select NF_DEFRAG_IPV6
- ---help---
+ help
Add IPv6 support to IPVS.
Say Y if unsure.
config IP_VS_DEBUG
bool "IP virtual server debugging"
- ---help---
+ help
Say Y here if you want to get additional messages useful in
debugging the IP virtual server code. You can change the debug
level in /proc/sys/net/ipv4/vs/debug_level
@@ -47,7 +46,7 @@ config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
range 8 20
default 12
- ---help---
+ help
The IPVS connection hash table uses the chaining scheme to handle
hash collisions. Using a big IPVS connection hash table will greatly
reduce conflicts when there are hundreds of thousands of connections
@@ -78,13 +77,13 @@ comment "IPVS transport protocol load balancing support"
config IP_VS_PROTO_TCP
bool "TCP load balancing support"
- ---help---
+ help
This option enables support for load balancing TCP transport
protocol. Say Y if unsure.
config IP_VS_PROTO_UDP
bool "UDP load balancing support"
- ---help---
+ help
This option enables support for load balancing UDP transport
protocol. Say Y if unsure.
@@ -93,20 +92,20 @@ config IP_VS_PROTO_AH_ESP
config IP_VS_PROTO_ESP
bool "ESP load balancing support"
- ---help---
+ help
This option enables support for load balancing ESP (Encapsulation
Security Payload) transport protocol. Say Y if unsure.
config IP_VS_PROTO_AH
bool "AH load balancing support"
- ---help---
+ help
This option enables support for load balancing AH (Authentication
Header) transport protocol. Say Y if unsure.
config IP_VS_PROTO_SCTP
bool "SCTP load balancing support"
select LIBCRC32C
- ---help---
+ help
This option enables support for load balancing SCTP transport
protocol. Say Y if unsure.
@@ -114,7 +113,7 @@ comment "IPVS scheduler"
config IP_VS_RR
tristate "round-robin scheduling"
- ---help---
+ help
The round-robin scheduling algorithm simply directs network
connections to different real servers in a round-robin manner.
@@ -123,7 +122,7 @@ config IP_VS_RR
config IP_VS_WRR
tristate "weighted round-robin scheduling"
- ---help---
+ help
The weighted round-robin scheduling algorithm directs network
connections to different real servers based on server weights
in a round-robin manner. Servers with higher weights receive
@@ -136,7 +135,7 @@ config IP_VS_WRR
config IP_VS_LC
tristate "least-connection scheduling"
- ---help---
+ help
The least-connection scheduling algorithm directs network
connections to the server with the least number of active
connections.
@@ -146,7 +145,7 @@ config IP_VS_LC
config IP_VS_WLC
tristate "weighted least-connection scheduling"
- ---help---
+ help
The weighted least-connection scheduling algorithm directs network
connections to the server with the least active connections
normalized by the server weight.
@@ -156,7 +155,7 @@ config IP_VS_WLC
config IP_VS_FO
tristate "weighted failover scheduling"
- ---help---
+ help
The weighted failover scheduling algorithm directs network
connections to the server with the highest weight that is
currently available.
@@ -166,7 +165,7 @@ config IP_VS_FO
config IP_VS_OVF
tristate "weighted overflow scheduling"
- ---help---
+ help
The weighted overflow scheduling algorithm directs network
connections to the server with the highest weight that is
currently available and overflows to the next when active
@@ -177,7 +176,7 @@ config IP_VS_OVF
config IP_VS_LBLC
tristate "locality-based least-connection scheduling"
- ---help---
+ help
The locality-based least-connection scheduling algorithm is for
destination IP load balancing. It is usually used in cache cluster.
This algorithm usually directs packet destined for an IP address to
@@ -191,7 +190,7 @@ config IP_VS_LBLC
config IP_VS_LBLCR
tristate "locality-based least-connection with replication scheduling"
- ---help---
+ help
The locality-based least-connection with replication scheduling
algorithm is also for destination IP load balancing. It is
usually used in cache cluster. It differs from the LBLC scheduling
@@ -209,7 +208,7 @@ config IP_VS_LBLCR
config IP_VS_DH
tristate "destination hashing scheduling"
- ---help---
+ help
The destination hashing scheduling algorithm assigns network
connections to the servers through looking up a statically assigned
hash table by their destination IP addresses.
@@ -219,7 +218,7 @@ config IP_VS_DH
config IP_VS_SH
tristate "source hashing scheduling"
- ---help---
+ help
The source hashing scheduling algorithm assigns network
connections to the servers through looking up a statically assigned
hash table by their source IP addresses.
@@ -229,7 +228,7 @@ config IP_VS_SH
config IP_VS_MH
tristate "maglev hashing scheduling"
- ---help---
+ help
The maglev consistent hashing scheduling algorithm provides
Google's Maglev hashing algorithm as an IPVS scheduler. It assigns
network connections to the servers through looking up a statically
@@ -248,7 +247,7 @@ config IP_VS_MH
config IP_VS_SED
tristate "shortest expected delay scheduling"
- ---help---
+ help
The shortest expected delay scheduling algorithm assigns network
connections to the server with the shortest expected delay. The
expected delay that the job will experience is (Ci + 1) / Ui if
@@ -261,7 +260,7 @@ config IP_VS_SED
config IP_VS_NQ
tristate "never queue scheduling"
- ---help---
+ help
The never queue scheduling algorithm adopts a two-speed model.
When there is an idle server available, the job will be sent to
the idle server, instead of waiting for a fast one. When there
@@ -272,13 +271,24 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_TWOS
+ tristate "weighted random twos choice least-connection scheduling"
+ help
+ The weighted random twos choice least-connection scheduling
+ algorithm picks two random real servers and directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
comment 'IPVS SH scheduler'
config IP_VS_SH_TAB_BITS
int "IPVS source hashing table size (the Nth power of 2)"
range 4 20
default 8
- ---help---
+ help
The source hashing scheduler maps source IPs to destinations
stored in a hash table. This table is tiled by each destination
until all slots in the table are filled. When using weights to
@@ -293,7 +303,7 @@ config IP_VS_MH_TAB_INDEX
int "IPVS maglev hashing table index of size (the prime numbers)"
range 8 17
default 12
- ---help---
+ help
The maglev hashing scheduler maps source IPs to destinations
stored in a hash table. This table is assigned by a preference
list of the positions to each destination until all slots in
@@ -308,11 +318,11 @@ config IP_VS_MH_TAB_INDEX
comment 'IPVS application helper'
config IP_VS_FTP
- tristate "FTP protocol helper"
+ tristate "FTP protocol helper"
depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
NF_CONNTRACK_FTP
select IP_VS_NFCT
- ---help---
+ help
FTP is a protocol that transfers IP address and/or port number in
the payload. In the virtual server via Network Address Translation,
the IP address and port number of real servers cannot be sent to
@@ -326,7 +336,7 @@ config IP_VS_FTP
config IP_VS_NFCT
bool "Netfilter connection tracking"
depends on NF_CONNTRACK
- ---help---
+ help
The Netfilter connection tracking support allows the IPVS
connection state to be exported to the Netfilter framework
for filtering purposes.
@@ -335,7 +345,7 @@ config IP_VS_PE_SIP
tristate "SIP persistence engine"
depends on IP_VS_PROTO_UDP
depends on NF_CONNTRACK_SIP
- ---help---
+ help
Allow persistence based on the SIP Call-ID
endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index bfce2677fda2..bb5d8125c82a 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index f9b16f2b2219..fdacbc3c15be 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
- sizeof(struct seq_net_private));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net,
+ &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+#endif
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
unregister_ip_vs_app(ipvs, NULL /* all */);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
+#endif
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 02f2f636798d..13534e02346c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -402,6 +402,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned int hash;
struct ip_vs_conn *cp, *ret=NULL;
+ const union nf_inet_addr *saddr;
+ __be16 sport;
/*
* Check for "full" addressed entries
@@ -411,10 +413,20 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
rcu_read_lock();
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (p->vport == cp->cport && p->cport == cp->dport &&
- cp->af == p->af &&
+ if (p->vport != cp->cport)
+ continue;
+
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ sport = cp->vport;
+ saddr = &cp->vaddr;
+ } else {
+ sport = cp->dport;
+ saddr = &cp->daddr;
+ }
+
+ if (p->cport == sport && cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
- ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, saddr) &&
p->protocol == cp->protocol &&
cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
@@ -807,6 +819,31 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head)
kmem_cache_free(ip_vs_conn_cachep, cp);
}
+/* Try to delete connection while not holding reference */
+static void ip_vs_conn_del(struct ip_vs_conn *cp)
+{
+ if (del_timer(&cp->timer)) {
+ /* Drop cp->control chain too */
+ if (cp->control)
+ cp->timeout = 0;
+ ip_vs_conn_expire(&cp->timer);
+ }
+}
+
+/* Try to delete connection while holding reference */
+static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
+{
+ if (del_timer(&cp->timer)) {
+ /* Drop cp->control chain too */
+ if (cp->control)
+ cp->timeout = 0;
+ __ip_vs_conn_put(cp);
+ ip_vs_conn_expire(&cp->timer);
+ } else {
+ __ip_vs_conn_put(cp);
+ }
+}
+
static void ip_vs_conn_expire(struct timer_list *t)
{
struct ip_vs_conn *cp = from_timer(cp, t, timer);
@@ -827,14 +864,17 @@ static void ip_vs_conn_expire(struct timer_list *t)
/* does anybody control me? */
if (ct) {
+ bool has_ref = !cp->timeout && __ip_vs_conn_get(ct);
+
ip_vs_control_del(cp);
/* Drop CTL or non-assured TPL if not used anymore */
- if (!cp->timeout && !atomic_read(&ct->n_control) &&
+ if (has_ref && !atomic_read(&ct->n_control) &&
(!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
!(ct->state & IP_VS_CTPL_S_ASSURED))) {
IP_VS_DBG(4, "drop controlling connection\n");
- ct->timeout = 0;
- ip_vs_conn_expire_now(ct);
+ ip_vs_conn_del_put(ct);
+ } else if (has_ref) {
+ __ip_vs_conn_put(ct);
}
}
@@ -1225,8 +1265,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
+ static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static signed char todrop_counter[9] = {0};
int i;
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
@@ -1268,7 +1308,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+ unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
@@ -1317,8 +1357,7 @@ try_drop:
drop:
IP_VS_DBG(4, "drop connection\n");
- cp->timeout = 0;
- ip_vs_conn_expire_now(cp);
+ ip_vs_conn_del(cp);
}
cond_resched_rcu();
}
@@ -1341,19 +1380,15 @@ flush_again:
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (cp->ipvs != ipvs)
continue;
- /* As timers are expired in LIFO order, restart
- * the timer of controlling connection first, so
- * that it is expired after us.
- */
+ if (atomic_read(&cp->n_control))
+ continue;
cp_c = cp->control;
- /* cp->control is valid only with reference to cp */
- if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_del(cp);
+ if (cp_c && !atomic_read(&cp_c->n_control)) {
IP_VS_DBG(4, "del controlling connection\n");
- ip_vs_conn_expire_now(cp_c);
- __ip_vs_conn_put(cp);
+ ip_vs_conn_del(cp_c);
}
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
}
cond_resched_rcu();
}
@@ -1366,6 +1401,45 @@ flush_again:
goto flush_again;
}
}
+
+#ifdef CONFIG_SYSCTL
+void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+ struct ip_vs_dest *dest;
+
+ rcu_read_lock();
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (cp->ipvs != ipvs)
+ continue;
+
+ dest = cp->dest;
+ if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
+ continue;
+
+ if (atomic_read(&cp->n_control))
+ continue;
+
+ cp_c = cp->control;
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_del(cp);
+ if (cp_c && !atomic_read(&cp_c->n_control)) {
+ IP_VS_DBG(4, "del controlling connection\n");
+ ip_vs_conn_del(cp_c);
+ }
+ }
+ cond_resched_rcu();
+
+ /* netns clean up started, abort delayed work */
+ if (!ipvs->enable)
+ break;
+ }
+ rcu_read_unlock();
+}
+#endif
+
/*
* per netns init and exit
*/
@@ -1373,20 +1447,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
atomic_set(&ipvs->conn_count, 0);
- proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
- &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
- proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
- &ip_vs_conn_sync_seq_ops,
- sizeof(struct ip_vs_iter_state));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+ &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn;
+
+ if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn_sync;
+#endif
+
return 0;
+
+#ifdef CONFIG_PROC_FS
+err_conn_sync:
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+err_conn:
+ return -ENOMEM;
+#endif
}
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
ip_vs_conn_flush(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
+#endif
}
int __init ip_vs_conn_init(void)
@@ -1394,6 +1484,10 @@ int __init ip_vs_conn_init(void)
int idx;
/* Compute size and mask */
+ if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
+ pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
+ ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+ }
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
@@ -1417,7 +1511,7 @@ int __init ip_vs_conn_init(void)
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
ip_vs_conn_tab_size,
- (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
+ (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
sizeof(struct ip_vs_conn));
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 512259f579d7..51ad557a525b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -68,18 +68,6 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-INDIRECT_CALLABLE_DECLARE(int
- tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
- struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_UDP
-INDIRECT_CALLABLE_DECLARE(int
- udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
- struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
-#endif
-
#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
#define SNAT_CALL(f, ...) \
INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
@@ -694,16 +682,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
return ipvs->sysctl_nat_icmp_send;
}
-static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
-{
- return ipvs->sysctl_expire_nodest_conn;
-}
-
#else
static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
-static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
#endif
@@ -748,12 +730,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
struct dst_entry *dst = skb_dst(skb);
if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
- ip6_route_me_harder(ipvs->net, skb) != 0)
+ ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0)
return 1;
} else
#endif
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
+ ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0)
return 1;
return 0;
@@ -881,7 +863,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ goto after_nat;
/* Ensure the checksum is correct */
if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
@@ -907,6 +889,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
+after_nat:
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
@@ -915,8 +898,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
-
-ignore_cp:
verdict = NF_ACCEPT;
out:
@@ -1282,6 +1263,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
{
struct ip_vs_protocol *pp = pd->pp;
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto after_nat;
+
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
if (skb_ensure_writable(skb, iph->len))
@@ -1322,6 +1306,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
+after_nat:
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
@@ -1345,12 +1330,15 @@ drop:
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
+ unsigned int hooknum = state->hook;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ int af = state->pf;
struct sock *sk;
EnterFunction(11);
@@ -1418,11 +1406,8 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
ipvs, af, skb, &iph);
- if (likely(cp)) {
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
- }
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
@@ -1481,66 +1466,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
}
}
-out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
-
-ignore_cp:
- __ip_vs_conn_put(cp);
- goto out;
-}
-
-/*
- * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
- * used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_reply4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-/*
- * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_local_reply4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-
-/*
- * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
- * used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_reply6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-/*
- * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_local_reply6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
-#endif
-
static unsigned int
ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
@@ -1661,8 +1591,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
- bool ipip, new_cp = false;
+ bool tunnel, new_cp = false;
union nf_inet_addr *raddr;
+ char *outer_proto = "IPIP";
*related = 1;
@@ -1703,8 +1634,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
return NF_ACCEPT; /* The packet looks wrong, ignore */
raddr = (union nf_inet_addr *)&cih->daddr;
- /* Special case for errors for IPIP packets */
- ipip = false;
+ /* Special case for errors for IPIP/UDP/GRE tunnel packets */
+ tunnel = false;
if (cih->protocol == IPPROTO_IPIP) {
struct ip_vs_dest *dest;
@@ -1721,7 +1652,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- ipip = true;
+ tunnel = true;
} else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */
cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */
/* Error for our tunnel must arrive at LOCAL_IN */
@@ -1729,16 +1660,19 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
__u8 iproto;
int ulen;
- /* Non-first fragment has no UDP header */
+ /* Non-first fragment has no UDP/GRE header */
if (unlikely(cih->frag_off & htons(IP_OFFSET)))
return NF_ACCEPT;
offset2 = offset + cih->ihl * 4;
- if (cih->protocol == IPPROTO_UDP)
+ if (cih->protocol == IPPROTO_UDP) {
ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
raddr, &iproto);
- else
+ outer_proto = "UDP";
+ } else {
ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
raddr, &iproto);
+ outer_proto = "GRE";
+ }
if (ulen > 0) {
/* Skip IP and UDP/GRE tunnel headers */
offset = offset2 + ulen;
@@ -1747,7 +1681,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
&_ciph);
if (cih && cih->version == 4 && cih->ihl >= 5 &&
iproto == IPPROTO_IPIP)
- ipip = true;
+ tunnel = true;
else
return NF_ACCEPT;
}
@@ -1767,11 +1701,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
"Checking incoming ICMP for");
offset2 = offset;
- ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
offset = ciph.len;
/* The embedded headers contain source and dest in reverse order.
- * For IPIP this is error for request, not for reply.
+ * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
*/
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, AF_INET, skb, &ciph);
@@ -1779,7 +1713,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
if (!cp) {
int v;
- if (ipip || !sysctl_schedule_icmp(ipvs))
+ if (tunnel || !sysctl_schedule_icmp(ipvs))
return NF_ACCEPT;
if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
@@ -1797,7 +1731,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
goto out;
}
- if (ipip) {
+ if (tunnel) {
__be32 info = ic->un.gateway;
__u8 type = ic->type;
__u8 code = ic->code;
@@ -1809,17 +1743,18 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
u32 mtu = ntohs(ic->un.frag.mtu);
__be16 frag_off = cih->frag_off;
- /* Strip outer IP and ICMP, go to IPIP header */
+ /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
offset2 -= ihl + sizeof(_icmph);
skb_reset_network_header(skb);
- IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
+ outer_proto, &ip_hdr(skb)->saddr,
+ &ip_hdr(skb)->daddr, mtu);
ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
- goto ignore_ipip;
+ goto ignore_tunnel;
/* Prefer the resulting PMTU */
if (dest) {
struct ip_vs_dest_dst *dest_dst;
@@ -1832,11 +1767,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
mtu -= sizeof(struct iphdr);
info = htonl(mtu);
}
- /* Strip outer IP, ICMP and IPIP, go to IP header of
+ /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of
* original request.
*/
if (pskb_pull(skb, offset2) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
skb_reset_network_header(skb);
IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1845,7 +1780,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
/* ICMP can be shorter but anyways, account it */
ip_vs_out_stats(cp, skb);
-ignore_ipip:
+ignore_tunnel:
consume_skb(skb);
verdict = NF_STOLEN;
goto out;
@@ -1975,15 +1910,17 @@ out:
* and send it on its way...
*/
static unsigned int
-ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
+ unsigned int hooknum = state->hook;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
- int conn_reuse_mode;
struct sock *sk;
+ int af = state->pf;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -2059,16 +1996,17 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, af, skb, &iph);
- conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
- if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
- bool uses_ct = false, resched = false;
+ if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+ int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+ bool old_ct = false, resched = false;
if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight))) {
resched = true;
- uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
- } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
- uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+ old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
+ } else if (conn_reuse_mode &&
+ is_new_conn_expected(cp, conn_reuse_mode)) {
+ old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
if (!atomic_read(&cp->n_control)) {
resched = true;
} else {
@@ -2076,50 +2014,51 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
* that uses conntrack while it is still
* referenced by controlled connection(s).
*/
- resched = !uses_ct;
+ resched = !old_ct;
}
}
if (resched) {
+ if (!old_ct)
+ cp->flags &= ~IP_VS_CONN_F_NFCT;
if (!atomic_read(&cp->n_control))
ip_vs_conn_expire_now(cp);
__ip_vs_conn_put(cp);
- if (uses_ct)
+ if (old_ct)
return NF_DROP;
cp = NULL;
}
}
- if (unlikely(!cp)) {
- int v;
-
- if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
- return v;
- }
-
- IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
-
/* Check the server status */
- if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
+ if (sysctl_expire_nodest_conn(ipvs)) {
+ bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
- __u32 flags = cp->flags;
+ if (!old_ct)
+ cp->flags &= ~IP_VS_CONN_F_NFCT;
- /* when timer already started, silently drop the packet.*/
- if (timer_pending(&cp->timer))
- __ip_vs_conn_put(cp);
- else
- ip_vs_conn_put(cp);
-
- if (sysctl_expire_nodest_conn(ipvs) &&
- !(flags & IP_VS_CONN_F_ONE_PACKET)) {
- /* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ if (old_ct)
+ return NF_DROP;
+ cp = NULL;
+ } else {
+ __ip_vs_conn_put(cp);
+ return NF_DROP;
}
+ }
- return NF_DROP;
+ if (unlikely(!cp)) {
+ int v;
+
+ if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
+ return v;
}
+ IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
+
ip_vs_in_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
@@ -2142,7 +2081,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
pkts = sysctl_sync_threshold(ipvs);
else
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -2155,55 +2094,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
}
/*
- * AF_INET handler in NF_INET_LOCAL_IN chain
- * Schedule and forward packets from remote clients
- */
-static unsigned int
-ip_vs_remote_request4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-/*
- * AF_INET handler in NF_INET_LOCAL_OUT chain
- * Schedule and forward packets from local clients
- */
-static unsigned int
-ip_vs_local_request4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-
-/*
- * AF_INET6 handler in NF_INET_LOCAL_IN chain
- * Schedule and forward packets from remote clients
- */
-static unsigned int
-ip_vs_remote_request6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-/*
- * AF_INET6 handler in NF_INET_LOCAL_OUT chain
- * Schedule and forward packets from local clients
- */
-static unsigned int
-ip_vs_local_request6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-#endif
-
-
-/*
* It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
* related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
@@ -2216,45 +2106,36 @@ static unsigned int
ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- int r;
struct netns_ipvs *ipvs = net_ipvs(state->net);
-
- if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
- return NF_ACCEPT;
+ int r;
/* ipvs enabled in this netns ? */
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
-}
-
+ if (state->pf == NFPROTO_IPV4) {
+ if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
#ifdef CONFIG_IP_VS_IPV6
-static unsigned int
-ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- int r;
- struct netns_ipvs *ipvs = net_ipvs(state->net);
- struct ip_vs_iphdr iphdr;
+ } else {
+ struct ip_vs_iphdr iphdr;
- ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
- if (iphdr.protocol != IPPROTO_ICMPV6)
- return NF_ACCEPT;
+ ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
- /* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
- return NF_ACCEPT;
+ if (iphdr.protocol != IPPROTO_ICMPV6)
+ return NF_ACCEPT;
- return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
-}
+ return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
#endif
+ }
+ return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
+}
-static const struct nf_hook_ops ip_vs_ops[] = {
+static const struct nf_hook_ops ip_vs_ops4[] = {
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
@@ -2263,21 +2144,21 @@ static const struct nf_hook_ops ip_vs_ops[] = {
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_remote_request4,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_local_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_local_request4,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
@@ -2292,15 +2173,18 @@ static const struct nf_hook_ops ip_vs_ops[] = {
},
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
+};
+
#ifdef CONFIG_IP_VS_IPV6
+static const struct nf_hook_ops ip_vs_ops6[] = {
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 2,
@@ -2309,21 +2193,21 @@ static const struct nf_hook_ops ip_vs_ops[] = {
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_remote_request6,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_local_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_local_request6,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 2,
@@ -2331,20 +2215,76 @@ static const struct nf_hook_ops ip_vs_ops[] = {
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
- .hook = ip_vs_forward_icmp_v6,
+ .hook = ip_vs_forward_icmp,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
-#endif
};
+#endif
+
+int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af)
+{
+ const struct nf_hook_ops *ops;
+ unsigned int count;
+ unsigned int afmask;
+ int ret = 0;
+
+ if (af == AF_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ ops = ip_vs_ops6;
+ count = ARRAY_SIZE(ip_vs_ops6);
+ afmask = 2;
+#else
+ return -EINVAL;
+#endif
+ } else {
+ ops = ip_vs_ops4;
+ count = ARRAY_SIZE(ip_vs_ops4);
+ afmask = 1;
+ }
+
+ if (!(ipvs->hooks_afmask & afmask)) {
+ ret = nf_register_net_hooks(ipvs->net, ops, count);
+ if (ret >= 0)
+ ipvs->hooks_afmask |= afmask;
+ }
+ return ret;
+}
+
+void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af)
+{
+ const struct nf_hook_ops *ops;
+ unsigned int count;
+ unsigned int afmask;
+
+ if (af == AF_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ ops = ip_vs_ops6;
+ count = ARRAY_SIZE(ip_vs_ops6);
+ afmask = 2;
+#else
+ return;
+#endif
+ } else {
+ ops = ip_vs_ops4;
+ count = ARRAY_SIZE(ip_vs_ops4);
+ afmask = 1;
+ }
+
+ if (ipvs->hooks_afmask & afmask) {
+ nf_unregister_net_hooks(ipvs->net, ops, count);
+ ipvs->hooks_afmask &= ~afmask;
+ }
+}
+
/*
* Initialize IP Virtual Server netns mem.
*/
@@ -2356,7 +2296,7 @@ static int __net_init __ip_vs_init(struct net *net)
if (ipvs == NULL)
return -ENOMEM;
- /* Hold the beast until a service is registerd */
+ /* Hold the beast until a service is registered */
ipvs->enable = 0;
ipvs->net = net;
/* Counters used for creating unique names */
@@ -2420,19 +2360,6 @@ static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
}
}
-static int __net_init __ip_vs_dev_init(struct net *net)
-{
- int ret;
-
- ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- if (ret < 0)
- goto hook_fail;
- return 0;
-
-hook_fail:
- return ret;
-}
-
static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
{
struct netns_ipvs *ipvs;
@@ -2441,7 +2368,8 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
EnterFunction(2);
list_for_each_entry(net, net_list, exit_list) {
ipvs = net_ipvs(net);
- nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ ip_vs_unregister_hooks(ipvs, AF_INET);
+ ip_vs_unregister_hooks(ipvs, AF_INET6);
ipvs->enable = 0; /* Disable packet reception */
smp_wmb();
ip_vs_sync_net_cleanup(ipvs);
@@ -2457,7 +2385,6 @@ static struct pernet_operations ipvs_core_ops = {
};
static struct pernet_operations ipvs_core_dev_ops = {
- .init = __ip_vs_dev_init,
.exit_batch = __ip_vs_dev_cleanup_batch,
};
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 8d14a1acbc37..988222fff9f0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -24,7 +24,6 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
-#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -48,6 +47,8 @@
#include <net/ip_vs.h>
+MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
+
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
@@ -210,6 +211,17 @@ static void update_defense_level(struct netns_ipvs *ipvs)
local_bh_enable();
}
+/* Delayed-work handler: recovers the owning netns_ipvs from @work and
+ * expires its connections that have no destination
+ */
+static void expire_nodest_conn_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = container_of(work, struct netns_ipvs,
+ expire_nodest_conn_work.work);
+ ip_vs_expire_nodest_conn_flush(ipvs);
+}
/*
* Timer for checking the defense
@@ -224,7 +236,8 @@ static void defense_work_handler(struct work_struct *work)
update_defense_level(ipvs);
if (atomic_read(&ipvs->dropentry))
ip_vs_random_dropentry(ipvs);
- schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ queue_delayed_work(system_long_wq, &ipvs->defense_work,
+ DEFENSE_TIMER_PERIOD);
}
#endif
@@ -947,8 +960,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Create a destination for the given service
*/
static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
- struct ip_vs_dest **dest_p)
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
unsigned int atype, i;
@@ -1008,8 +1020,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
- *dest_p = dest;
-
LeaveFunction(2);
return 0;
@@ -1083,7 +1093,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Allocate and initialize the dest structure
*/
- ret = ip_vs_new_dest(svc, udest, &dest);
+ ret = ip_vs_new_dest(svc, udest);
}
LeaveFunction(2);
@@ -1163,6 +1173,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
list_add(&dest->t_list, &ipvs->dest_trash);
dest->idle_start = 0;
spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ /* Queue up delayed work to expire all no destination connections.
+ * No-op when CONFIG_SYSCTL is disabled.
+ */
+ if (!cleanup)
+ ip_vs_enqueue_expire_nodest_conns(ipvs);
}
@@ -1272,6 +1288,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ int ret_hooks = -1;
/* increase the module use count */
if (!ip_vs_use_count_inc())
@@ -1313,6 +1330,14 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
}
#endif
+ if ((u->af == AF_INET && !ipvs->num_services) ||
+ (u->af == AF_INET6 && !ipvs->num_services6)) {
+ ret = ip_vs_register_hooks(ipvs, u->af);
+ if (ret < 0)
+ goto out_err;
+ ret_hooks = ret;
+ }
+
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
@@ -1340,7 +1365,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
- svc->flags = u->flags;
+ svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->ipvs = ipvs;
@@ -1374,6 +1399,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
+ else if (svc->af == AF_INET6)
+ ipvs->num_services6++;
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
@@ -1385,6 +1412,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
out_err:
+ if (ret_hooks >= 0)
+ ip_vs_unregister_hooks(ipvs, u->af);
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
@@ -1500,9 +1529,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_pe *old_pe;
struct netns_ipvs *ipvs = svc->ipvs;
- /* Count only IPv4 services for old get/setsockopt interface */
- if (svc->af == AF_INET)
+ if (svc->af == AF_INET) {
ipvs->num_services--;
+ if (!ipvs->num_services)
+ ip_vs_unregister_hooks(ipvs, svc->af);
+ } else if (svc->af == AF_INET6) {
+ ipvs->num_services6--;
+ if (!ipvs->num_services6)
+ ip_vs_unregister_hooks(ipvs, svc->af);
+ }
ip_vs_stop_estimator(svc->ipvs, &svc->stats);
@@ -1732,11 +1767,9 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
#ifdef CONFIG_SYSCTL
-static int three = 3;
-
static int
proc_do_defense_mode(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
@@ -1763,7 +1796,7 @@ proc_do_defense_mode(struct ctl_table *table, int write,
static int
proc_do_sync_threshold(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val[2];
@@ -1788,7 +1821,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
static int
proc_do_sync_ports(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val = *valp;
@@ -1942,7 +1975,7 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &three,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "nat_icmp_send",
@@ -1980,6 +2013,12 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "run_estimation",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -2414,7 +2453,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
}
static int
-do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
struct net *net = sock_net(sk);
int ret;
@@ -2438,7 +2477,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
return -EINVAL;
}
- if (copy_from_user(arg, user, len) != 0)
+ if (copy_from_sockptr(arg, ptr, len) != 0)
return -EFAULT;
/* Handle daemons since they have another lock */
@@ -2471,6 +2510,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
+ } else if (!len) {
+ /* No more commands with len == 0 below */
+ ret = -EINVAL;
+ goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2547,9 +2590,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
break;
case IP_VS_SO_SET_DELDEST:
ret = ip_vs_del_dest(svc, &udest);
- break;
- default:
- ret = -EINVAL;
}
out_unlock:
@@ -2571,7 +2611,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
+ strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2765,13 +2805,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
+ strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
sizeof(d[0].mcast_ifn));
d[0].syncid = ipvs->mcfg.syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
+ strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
sizeof(d[1].mcast_ifn));
d[1].syncid = ipvs->bcfg.syncid;
}
@@ -3521,7 +3561,7 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
sizeof(c.mcast_ifn));
c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
@@ -3855,7 +3895,7 @@ out:
}
-static const struct genl_ops ip_vs_genl_ops[] = {
+static const struct genl_small_ops ip_vs_genl_ops[] = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -3963,8 +4003,9 @@ static struct genl_family ip_vs_genl_family __ro_after_init = {
.policy = ip_vs_cmd_policy,
.netnsok = true, /* Make ipvsadm to work on netns */
.module = THIS_MODULE,
- .ops = ip_vs_genl_ops,
- .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .small_ops = ip_vs_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .resv_start_op = IPVS_CMD_FLUSH + 1,
};
static int __init ip_vs_genl_register(void)
@@ -4052,6 +4093,13 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
+ ipvs->sysctl_run_estimation = 1;
+ tbl[idx++].data = &ipvs->sysctl_run_estimation;
+#ifdef CONFIG_IP_VS_DEBUG
+ /* Global sysctls must be ro in non-init netns */
+ if (!net_eq(net, &init_net))
+ tbl[idx++].mode = 0444;
+#endif
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
if (ipvs->sysctl_hdr == NULL) {
@@ -4063,7 +4111,12 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_tbl = tbl;
/* Schedule defense work */
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
- schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ queue_delayed_work(system_long_wq, &ipvs->defense_work,
+ DEFENSE_TIMER_PERIOD);
+
+ /* Init delayed work for expiring no dest conn */
+ INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
+ expire_nodest_conn_handler);
return 0;
}
@@ -4072,6 +4125,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
+ cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
@@ -4123,12 +4177,18 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
spin_lock_init(&ipvs->tot_stats.lock);
- proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops,
- sizeof(struct ip_vs_iter));
- proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
- ip_vs_stats_show, NULL);
- proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
- ip_vs_stats_percpu_show, NULL);
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
+ &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
+ goto err_vs;
+ if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
+ ip_vs_stats_show, NULL))
+ goto err_stats;
+ if (!proc_create_net_single("ip_vs_stats_percpu", 0,
+ ipvs->net->proc_net,
+ ip_vs_stats_percpu_show, NULL))
+ goto err_percpu;
+#endif
if (ip_vs_control_net_init_sysctl(ipvs))
goto err;
@@ -4136,6 +4196,17 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
return 0;
err:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+
+err_percpu:
+ remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+
+err_stats:
+ remove_proc_entry("ip_vs", ipvs->net->proc_net);
+
+err_vs:
+#endif
free_percpu(ipvs->tot_stats.cpustats);
return -ENOMEM;
}
@@ -4144,9 +4215,11 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
+#endif
free_percpu(ipvs->tot_stats.cpustats);
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 05b8112ffb37..9a1a7af6a186 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -100,6 +100,9 @@ static void estimation_timer(struct timer_list *t)
u64 rate;
struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
+ if (!sysctl_run_estimation(ipvs))
+ goto skip;
+
spin_lock(&ipvs->est_lock);
list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
@@ -131,6 +134,8 @@ static void estimation_timer(struct timer_list *t)
spin_unlock(&s->lock);
}
spin_unlock(&ipvs->est_lock);
+
+skip:
mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index cf925906f59b..ef1f45e43b63 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -591,8 +591,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
- pr_info("%s: loaded support on port[%d] = %u\n",
- app->name, i, ports[i]);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index da0280cec506..e3d7f5c879ce 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -174,8 +174,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
return 0;
}
- table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
- sizeof(unsigned long), GFP_KERNEL);
+ table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -227,7 +226,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
}
out:
- kfree(table);
+ bitmap_free(table);
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 32b028853a7c..7da51390cea6 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -315,7 +315,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
- /* fall through */
+ fallthrough;
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -539,8 +539,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
- IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
- "%s:%d state: %s->%s conn->refcnt:%d\n",
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
+ "d:%s:%d state: %s->%s conn->refcnt:%d\n",
pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
@@ -548,10 +548,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
th->fin ? 'F' : '.',
th->ack ? 'A' : '.',
th->rst ? 'R' : '.',
- IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
- ntohs(cp->dport),
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
tcp_state_name(cp->state),
tcp_state_name(new_state),
refcount_read(&cp->refcnt));
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 153d89647c87..68260d91c988 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -318,7 +318,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, udphoff,
skb->len - udphoff, 0);
- /* fall through */
+ fallthrough;
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 605e0f68f8bd..a56fd0b5a430 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -242,9 +242,6 @@ struct ip_vs_sync_thread_data {
| IPVS Sync Connection (1) |
*/
-#define SYNC_MESG_HEADER_LEN 4
-#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-
/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
@@ -618,7 +615,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
cp = cp->control;
if (cp) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -779,7 +776,7 @@ control:
if (!cp)
return;
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
goto sloop;
@@ -1283,12 +1280,12 @@ static void set_sock_size(struct sock *sk, int mode, int val)
lock_sock(sk);
if (mode) {
val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
- sysctl_wmem_max);
+ READ_ONCE(sysctl_wmem_max));
sk->sk_sndbuf = val * 2;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
} else {
val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
- sysctl_rmem_max);
+ READ_ONCE(sysctl_rmem_max));
sk->sk_rcvbuf = val * 2;
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
}
@@ -1717,6 +1714,8 @@ static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct netns_ipvs *ipvs = tinfo->ipvs;
+ struct sock *sk = tinfo->sock->sk;
+ struct udp_sock *up = udp_sk(sk);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
@@ -1724,12 +1723,14 @@ static int sync_thread_backup(void *data)
ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
while (!kthread_should_stop()) {
- wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
- !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
- || kthread_should_stop());
+ wait_event_interruptible(*sk_sleep(sk),
+ !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ !skb_queue_empty_lockless(&up->reader_queue) ||
+ kthread_should_stop());
/* do we have data now? */
- while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+ while (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ !skb_queue_empty_lockless(&up->reader_queue)) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
ipvs->bcfg.sync_maxlen);
if (len <= 0) {
diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c
new file mode 100644
index 000000000000..f2579fc9c75b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_twos.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* IPVS: Power of Twos Choice Scheduling module
+ *
+ * Authors: Darby Payne <darby.payne@applovin.com>
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include <net/ip_vs.h>
+
+/* Power of Twos Choice scheduling, algorithm originally described by
+ * Michael Mitzenmacher.
+ *
+ * Randomly picks two destinations and chooses the one with the fewest
+ * connections
+ *
+ * The algorithm calculates a few variables
+ * - total_weight = sum of all weights
+ * - rweight1 = random number between [0,total_weight]
+ * - rweight2 = random number between [0,total_weight]
+ *
+ * For each destination
+ * decrement rweight1 and rweight2 by the destination weight
+ * pick choice1 when rweight1 is <= 0
+ * pick choice2 when rweight2 is <= 0
+ *
+ * Return choice2 if choice2 has fewer connections than choice1, normalized
+ * by weight; otherwise return choice1 (NULL if no destination is available)
+ *
+ * References
+ * ----------
+ *
+ * [Mitzenmacher 2016]
+ * The Power of Two Random Choices: A Survey of Techniques and Results
+ * Michael Mitzenmacher, Andrea W. Richa, Ramesh Sitaraman
+ * http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
+ *
+ */
+static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc,
+ const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL;
+ int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0;
+ int overhead2, total_weight = 0, weight;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* Sum the weights of all non-overloaded destinations with weight > 0 */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ total_weight += weight;
+ choice1 = dest; /* non-NULL marks a usable dest; kept as fallback */
+ }
+ }
+ }
+
+ if (!choice1) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Add 1 to total_weight so that the random weights are inclusive
+ * from 0 to total_weight
+ */
+ total_weight += 1;
+ rweight1 = prandom_u32_max(total_weight);
+ rweight2 = prandom_u32_max(total_weight);
+
+ /* Pick two weighted servers */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ weight = atomic_read(&dest->weight);
+ if (weight <= 0)
+ continue;
+
+ rweight1 -= weight;
+ rweight2 -= weight;
+
+ if (rweight1 <= 0 && weight1 == -1) {
+ choice1 = dest;
+ weight1 = weight;
+ overhead1 = ip_vs_dest_conn_overhead(dest);
+ }
+
+ if (rweight2 <= 0 && weight2 == -1) {
+ choice2 = dest;
+ weight2 = weight;
+ overhead2 = ip_vs_dest_conn_overhead(dest);
+ }
+
+ if (weight1 != -1 && weight2 != -1)
+ goto nextstage;
+ }
+
+nextstage:
+ if (choice2 && (weight2 * overhead1) > (weight1 * overhead2))
+ choice1 = choice2;
+
+ IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(choice1->af, &choice1->addr),
+ ntohs(choice1->port), atomic_read(&choice1->activeconns),
+ refcount_read(&choice1->refcnt),
+ atomic_read(&choice1->weight));
+
+ return choice1;
+}
+
+static struct ip_vs_scheduler ip_vs_twos_scheduler = {
+ .name = "twos",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list),
+ .schedule = ip_vs_twos_schedule, /* the only callback this scheduler sets */
+};
+
+static int __init ip_vs_twos_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_twos_scheduler); /* init status is the registration result */
+}
+
+static void __exit ip_vs_twos_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_twos_scheduler);
+ synchronize_rcu(); /* NOTE(review): presumably waits out RCU readers still in ->schedule - confirm */
+}
+
+module_init(ip_vs_twos_init);
+module_exit(ip_vs_twos_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b00866d777fe..029171379884 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -609,6 +609,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (ret == NF_ACCEPT) {
nf_reset_ct(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
}
return ret;
}
@@ -649,6 +651,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -669,6 +673,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 82f36beb2e76..5d8ed6c90b7e 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -132,6 +132,9 @@ static int __nf_conncount_add(struct net *net,
struct nf_conn *found_ct;
unsigned int collect = 0;
+ if (time_is_after_eq_jiffies((unsigned long)list->last_gc))
+ goto add_new_node;
+
/* check the saved connections */
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
if (collect > CONNCOUNT_GC_MAX_NODES)
@@ -177,6 +180,7 @@ static int __nf_conncount_add(struct net *net,
nf_ct_put(found_ct);
}
+add_new_node:
if (WARN_ON_ONCE(list->count > INT_MAX))
return -EOVERFLOW;
@@ -190,6 +194,7 @@ static int __nf_conncount_add(struct net *net,
conn->jiffies32 = (u32)jiffies;
list_add_tail(&conn->node, &list->head);
list->count++;
+ list->last_gc = (u32)jiffies;
return 0;
}
@@ -214,6 +219,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
list->count = 0;
+ list->last_gc = (u32)jiffies;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
@@ -227,6 +233,10 @@ bool nf_conncount_gc_list(struct net *net,
unsigned int collected = 0;
bool ret = false;
+ /* don't bother if we just did GC */
+ if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc)))
+ return false;
+
/* don't bother if other cpu is already doing GC */
if (!spin_trylock(&list->list_lock))
return false;
@@ -258,6 +268,7 @@ bool nf_conncount_gc_list(struct net *net,
if (!list->count)
ret = true;
+ list->last_gc = (u32)jiffies;
spin_unlock(&list->list_lock);
return ret;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 2ccda8ace796..385a5f458aba 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Accouting handling for netfilter. */
+/* Accounting handling for netfilter. */
/*
* (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
@@ -22,26 +22,7 @@ static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
-static const struct nf_ct_ext_type acct_extend = {
- .len = sizeof(struct nf_conn_acct),
- .align = __alignof__(struct nf_conn_acct),
- .id = NF_CT_EXT_ACCT,
-};
-
void nf_conntrack_acct_pernet_init(struct net *net)
{
net->ct.sysctl_acct = nf_ct_acct;
}
-
-int nf_conntrack_acct_init(void)
-{
- int ret = nf_ct_extend_register(&acct_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_acct_fini(void)
-{
- nf_ct_extend_unregister(&acct_extend);
-}
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644
index 000000000000..8639e7efd0e2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx (xdp_md, __sk_buff)
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - Passed NULL for bpf_tuple pointer
+ * -EINVAL - opts->reserved is not 0
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
+ * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ * -ENONET - No network namespace found for netns_id
+ * -ENOENT - Conntrack lookup could not find entry for tuple
+ * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ * or sizeof(tuple->ipv6)
+ * @l4proto - Layer 4 protocol
+ * Values:
+ * IPPROTO_TCP, IPPROTO_UDP
+ * @dir - Connection tracking tuple direction
+ * @reserved - Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ */
+struct bpf_ct_opts {
+ s32 netns_id;
+ s32 error;
+ u8 l4proto;
+ u8 dir;
+ u8 reserved[2];
+};
+
+enum {
+ NF_BPF_CT_OPTS_SZ = 12,
+};
+
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, u8 protonum, u8 dir,
+ struct nf_conntrack_tuple *tuple)
+{
+ union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+ union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+ union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+ : &tuple->src.u;
+ union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+ : (void *)&tuple->dst.u;
+
+ if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+ return -EPROTO;
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ switch (tuple_len) {
+ case sizeof(bpf_tuple->ipv4):
+ tuple->src.l3num = AF_INET;
+ src->ip = bpf_tuple->ipv4.saddr;
+ sport->tcp.port = bpf_tuple->ipv4.sport;
+ dst->ip = bpf_tuple->ipv4.daddr;
+ dport->tcp.port = bpf_tuple->ipv4.dport;
+ break;
+ case sizeof(bpf_tuple->ipv6):
+ tuple->src.l3num = AF_INET6;
+ memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+ sport->tcp.port = bpf_tuple->ipv6.sport;
+ memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+ dport->tcp.port = bpf_tuple->ipv6.dport;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = dir;
+
+ return 0;
+}
+
+static struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
+ u32 timeout)
+{
+ struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts_len != NF_BPF_CT_OPTS_SZ)
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &otuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_REPLY, &rtuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
+ GFP_ATOMIC);
+ if (IS_ERR(ct))
+ goto out;
+
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ __nf_ct_set_timeout(ct, timeout * HZ);
+
+out:
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return ct;
+}
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+ struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts,
+ u32 opts_len)
+{
+ struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts_len != NF_BPF_CT_OPTS_SZ)
+ return ERR_PTR(-EINVAL);
+ if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
+ return ERR_PTR(-EPROTO);
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &tuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+ if (opts->netns_id >= 0)
+ put_net(net);
+ if (!hash)
+ return ERR_PTR(-ENOENT);
+
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ opts->dir = NF_CT_DIRECTION(hash);
+
+ return ct;
+}
+
+BTF_ID_LIST(btf_nf_conn_ids)
+BTF_ID(struct, nf_conn)
+BTF_ID(struct, nf_conn___init)
+
+/* Check writes into `struct nf_conn` */
+static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ const struct btf_type *ncit;
+ const struct btf_type *nct;
+ size_t end;
+
+ ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]);
+ nct = btf_type_by_id(btf, btf_nf_conn_ids[0]);
+
+ if (t != nct && t != ncit) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ /* `struct nf_conn` and `struct nf_conn___init` have the same layout
+ * so we are safe to simply merge offset checks here
+ */
+ switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ case offsetof(struct nf_conn, mark):
+ end = offsetofend(struct nf_conn, mark);
+ break;
+#endif
+ default:
+ bpf_log(log, "no write support to nf_conn at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_conntrack BTF");
+
+/* bpf_xdp_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple of the new entry
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct nf_conn *nfct;
+
+ nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
+ opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = dev_net(ctx->rxq->dev);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_skb_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple of the new entry
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct nf_conn *nfct;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_insert_entry - Add the provided entry into a CT map
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID.
+ *
+ * @nfct_i - Pointer to referenced nf_conn___init object, obtained
+ * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ */
+struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+{
+ struct nf_conn *nfct = (struct nf_conn *)nfct_i;
+ int err;
+
+ nfct->status |= IPS_CONFIRMED;
+ err = nf_conntrack_hash_check_insert(nfct);
+ if (err < 0) {
+ nf_conntrack_free(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+void bpf_ct_release(struct nf_conn *nfct)
+{
+ if (!nfct)
+ return;
+ nf_ct_put(nfct);
+}
+
+/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
+ *
+ * Sets the default timeout of newly allocated nf_conn before insertion.
+ * This helper must be invoked for refcounted pointer to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @timeout - Timeout in msecs.
+ */
+void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+{
+ __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
+ *
+ * Change the timeout associated with the inserted or looked up nf_conn.
+ * This helper must be invoked for refcounted pointer to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
+ * @timeout - New timeout in msecs.
+ */
+int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+{
+ return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_set_status - Set status field of allocated nf_conn
+ *
+ * Set the status field of the newly allocated nf_conn before insertion.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @status - New status value.
+ */
+int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+{
+ return nf_ct_change_status_common((struct nf_conn *)nfct, status);
+}
+
+/* bpf_ct_change_status - Change status of inserted nf_conn
+ *
+ * Change the status field of the provided connection tracking entry.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @status - New status value.
+ */
+int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+{
+ return nf_ct_change_status_common(nfct, status);
+}
+
+__diag_pop()
+
+BTF_SET8_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_SET8_END(nf_ct_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ct_kfunc_set,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
+ if (!ret) {
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
+ mutex_unlock(&nf_conn_btf_access_lock);
+ }
+
+ return ret;
+}
+
+void cleanup_nf_conntrack_bpf(void)
+{
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = NULL;
+ mutex_unlock(&nf_conn_btf_access_lock);
+}
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 1ba6becc3079..9fb9b8031298 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -20,6 +20,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
unsigned int timeout)
{
+ const struct nf_conntrack_helper *helper;
struct nf_conntrack_expect *exp;
struct iphdr *iph = ip_hdr(skb);
struct rtable *rt = skb_rtable(skb);
@@ -58,7 +59,10 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
goto out;
exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ exp->tuple.src.u.udp.port = helper->tuple.src.u.udp.port;
exp->mask.src.u3.ip = mask;
exp->mask.src.u.udp.port = htons(0xFFFF);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1927fc296f95..f97bda06d2a9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -21,7 +21,6 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
@@ -35,10 +34,10 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -66,22 +65,39 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
struct conntrack_gc_work {
struct delayed_work dwork;
- u32 last_bucket;
+ u32 next_bucket;
+ u32 avg_timeout;
+ u32 count;
+ u32 start_time;
bool exiting;
bool early_drop;
- long next_gc_run;
};
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV 128u
-/* upper bound of full table scan */
-#define GC_MAX_SCAN_JIFFIES (16u * HZ)
-/* desired ratio of entries found to be expired */
-#define GC_EVICT_RATIO 50u
+/* serialize hash resizes and nf_ct_iterate_cleanup */
+static DEFINE_MUTEX(nf_conntrack_mutex);
+
+#define GC_SCAN_INTERVAL_MAX (60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN (1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ)
+
+/* Initial bias pretending we have 100 entries at the upper bound so we don't
+ * wake up often just because we have three entries with a 1s timeout while still
+ * allowing non-idle machines to wake up more often when needed.
+ */
+#define GC_SCAN_INITIAL_COUNT 100
+#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX
+
+#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
+#define GC_SCAN_EXPIRED_MAX (64000u / HZ)
+
+#define MIN_CHAINLEN 8u
+#define MAX_CHAINLEN (32u - MIN_CHAINLEN)
static struct conntrack_gc_work conntrack_gc_work;
@@ -143,12 +159,21 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
}
static void nf_conntrack_all_lock(void)
+ __acquires(&nf_conntrack_locks_all_lock)
{
int i;
spin_lock(&nf_conntrack_locks_all_lock);
- nf_conntrack_locks_all = true;
+ /* For nf_conntrack_locks_all, only the latest time when another
+ * CPU will see an update is controlled, by the "release" of the
+ * spin_lock below.
+ * The earliest time is not controlled, and thus KCSAN could detect
+ * a race when nf_conntrack_lock() reads the variable.
+ * WRITE_ONCE() is used to ensure the compiler will not
+ * optimize the write.
+ */
+ WRITE_ONCE(nf_conntrack_locks_all, true);
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_lock(&nf_conntrack_locks[i]);
@@ -162,6 +187,7 @@ static void nf_conntrack_all_lock(void)
}
static void nf_conntrack_all_unlock(void)
+ __releases(&nf_conntrack_locks_all_lock)
{
/* All prior stores must be complete before we clear
* 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
@@ -178,26 +204,35 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
-seqcount_t nf_conntrack_generation __read_mostly;
-static unsigned int nf_conntrack_hash_rnd __read_mostly;
+seqcount_spinlock_t nf_conntrack_generation __read_mostly;
+static siphash_aligned_key_t nf_conntrack_hash_rnd;
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid,
const struct net *net)
{
- unsigned int n;
- u32 seed;
+ struct {
+ struct nf_conntrack_man src;
+ union nf_inet_addr dst_addr;
+ unsigned int zone;
+ u32 net_mix;
+ u16 dport;
+ u16 proto;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- /* The direction must be ignored, so we hash everything up to the
- * destination ports (which is a multiple of 4) and treat the last
- * three bytes manually.
- */
- seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
- n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, seed ^
- (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ memset(&combined, 0, sizeof(combined));
+
+ /* The direction must be ignored, so handle usable members manually. */
+ combined.src = tuple->src;
+ combined.dst_addr = tuple->dst.u3;
+ combined.zone = zoneid;
+ combined.net_mix = net_hash_mix(net);
+ combined.dport = (__force __u16)tuple->dst.u.all;
+ combined.proto = tuple->dst.protonum;
+
+ return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
}
static u32 scale_hash(u32 hash)
@@ -207,15 +242,17 @@ static u32 scale_hash(u32 hash)
static u32 __hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid,
unsigned int size)
{
- return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}
static u32 hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid)
{
- return scale_hash(hash_conntrack_raw(tuple, net));
+ return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}
static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
@@ -295,20 +332,18 @@ nf_ct_get_tuple(const struct sk_buff *skb,
return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
case IPPROTO_TCP:
- case IPPROTO_UDP: /* fallthrough */
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
+ case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
case IPPROTO_UDPLITE:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
case IPPROTO_SCTP:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
case IPPROTO_DCCP:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
+ /* fallthrough */
+ return nf_ct_get_tuple_ports(skb, dataoff, tuple);
default:
break;
}
@@ -461,7 +496,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
*/
u32 nf_ct_get_id(const struct nf_conn *ct)
{
- static __read_mostly siphash_key_t ct_id_seed;
+ static siphash_aligned_key_t ct_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
@@ -491,53 +526,9 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_dying_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) dying list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->dying);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) unconfirmed list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->unconfirmed);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* We overload first tuple to link into unconfirmed or dying list.*/
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- spin_unlock(&pcpu->lock);
-}
-
#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
-/* Released via destroy_conntrack() */
+/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
gfp_t flags)
@@ -564,7 +555,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
tmpl->status = IPS_TEMPLATE;
write_pnet(&tmpl->ct_net, net);
nf_ct_zone_add(tmpl, zone);
- atomic_set(&tmpl->ct_general.use, 0);
+ refcount_set(&tmpl->ct_general.use, 1);
return tmpl;
}
@@ -572,7 +563,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
- nf_ct_ext_destroy(tmpl);
+ kfree(tmpl->ext);
if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -591,13 +582,12 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
#endif
}
-static void
-destroy_conntrack(struct nf_conntrack *nfct)
+void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- pr_debug("destroy_conntrack(%p)\n", ct);
- WARN_ON(atomic_read(&nfct->use) != 0);
+ pr_debug("%s(%p)\n", __func__, ct);
+ WARN_ON(refcount_read(&nfct->use) != 0);
if (unlikely(nf_ct_is_template(ct))) {
nf_ct_tmpl_free(ct);
@@ -607,7 +597,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
destroy_gre_conntrack(ct);
- local_bh_disable();
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
@@ -615,64 +604,91 @@ destroy_conntrack(struct nf_conntrack *nfct)
*/
nf_ct_remove_expectations(ct);
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
- local_bh_enable();
-
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ pr_debug("%s: returning ct=%p to slab\n", __func__, ct);
nf_conntrack_free(ct);
}
+EXPORT_SYMBOL(nf_ct_destroy);
-static void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
unsigned int sequence;
- nf_ct_helper_destroy(ct);
-
- local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
+}
+
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
- nf_ct_add_to_dying_list(ct);
+ __nf_ct_delete_from_lists(ct);
local_bh_enable();
}
+static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
+
+ spin_lock(&cnet->ecache.dying_lock);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &cnet->ecache.dying_list);
+ spin_unlock(&cnet->ecache.dying_lock);
+#endif
+}
+
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
+ struct net *net;
if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
return false;
tstamp = nf_conn_tstamp_find(ct);
- if (tstamp && tstamp->stop == 0)
+ if (tstamp) {
+ s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
+
tstamp->stop = ktime_get_real_ns();
+ if (timeout < 0)
+ tstamp->stop -= jiffies_to_nsecs(-timeout);
+ }
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
- nf_ct_delete_from_lists(ct);
- nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+ __nf_ct_delete_from_lists(ct);
+ nf_ct_add_to_ecache_list(ct);
+ local_bh_enable();
+
+ nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
return false;
}
- nf_conntrack_ecache_work(nf_ct_net(ct));
+ net = nf_ct_net(ct);
+ if (nf_conntrack_ecache_dwork_pending(net))
+ nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
return true;
@@ -711,9 +727,12 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
- if (!atomic_inc_not_zero(&ct->ct_general.use))
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
return;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct))
nf_ct_kill(ct);
@@ -779,7 +798,10 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
* in, try to obtain a reference and re-check tuple
*/
ct = nf_ct_tuplehash_to_ctrack(h);
- if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+ if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
+ /* re-check key after refcount */
+ smp_acquire__after_ctrl_dep();
+
if (likely(nf_ct_key_equal(h, tuple, zone, net)))
goto found;
@@ -799,8 +821,20 @@ struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
- return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple, net));
+ unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+ struct nf_conntrack_tuple_hash *thash;
+
+ thash = __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone_id, net));
+
+ if (thash)
+ return thash;
+
+ rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+ if (rid != zone_id)
+ return __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, rid, net));
+ return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -814,6 +848,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
&nf_conntrack_hash[reply_hash]);
}
+static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
+{
+ /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
+ * may contain stale pointers to e.g. helper that has been removed.
+ *
+ * The helper can't clear this because the nf_conn object isn't in
+ * any hash and synchronize_rcu() isn't enough because associated skb
+ * might sit in a queue.
+ */
+ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
+}
+
+static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
+{
+ if (!ext)
+ return true;
+
+ if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
+ return false;
+
+ /* inserted into conntrack table, nf_ct_iterate_cleanup()
+ * will find it. Disable nf_ct_ext_find() id check.
+ */
+ WRITE_ONCE(ext->gen_id, 0);
+ return true;
+}
+
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
@@ -822,50 +883,78 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int max_chainlen;
+ unsigned int chainlen = 0;
unsigned int sequence;
+ int err = -EEXIST;
zone = nf_ct_zone(ct);
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ return -ETIMEDOUT;
+ }
+
local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
+
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
+
+ chainlen = 0;
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
smp_wmb();
/* The caller holds a reference to this object */
- atomic_set(&ct->ct_general.use, 2);
+ refcount_set(&ct->ct_general.use, 2);
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
local_bh_enable();
- return 0;
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC(net, drop);
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ err = -ENOSPC;
out:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return -EEXIST;
+ return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-static inline void nf_ct_acct_update(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int len)
+void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
+ unsigned int bytes)
{
struct nf_conn_acct *acct;
@@ -873,10 +962,11 @@ static inline void nf_ct_acct_update(struct nf_conn *ct,
if (acct) {
struct nf_conn_counter *counter = acct->counter;
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ atomic64_add(packets, &counter[dir].packets);
+ atomic64_add(bytes, &counter[dir].bytes);
}
}
+EXPORT_SYMBOL_GPL(nf_ct_acct_add);
static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
const struct nf_conn *loser_ct)
@@ -890,7 +980,7 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
/* u32 should be fine since we must have seen one packet. */
bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
- nf_ct_acct_update(ct, ctinfo, bytes);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
}
@@ -898,8 +988,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
struct nf_conn_tstamp *tstamp;
- atomic_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
+ refcount_inc(&ct->ct_general.use);
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
@@ -907,6 +996,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}
+/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
{
@@ -920,23 +1010,20 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
if (nf_ct_is_dying(ct))
return NF_DROP;
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- return NF_DROP;
-
if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
nf_ct_match(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);
+ nf_conntrack_get(&ct->ct_general);
+
nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_ct_add_to_dying_list(loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_put(loser_ct);
nf_ct_set(skb, ct, ctinfo);
- NF_CT_STAT_INC(net, insert_failed);
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
- nf_ct_put(ct);
return NF_DROP;
}
@@ -977,7 +1064,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
}
/* We want the clashing entry to go away real soon: 1 second timeout. */
- loser_ct->timeout = nfct_time_stamp + HZ;
+ WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
/* IPS_NAT_CLASH removes the entry automatically on the first
* reply. Also prevents UDP tracker from moving the entry to
@@ -996,6 +1083,8 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
&nf_conntrack_hash[repl_idx]);
+
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
@@ -1004,7 +1093,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
*
* @skb: skb that causes the clash
* @h: tuplehash of the clashing entry already in table
- * @hash_reply: hash slot for reply direction
+ * @reply_hash: hash slot for reply direction
*
* A conntrack entry can be inserted to the connection tracking table
* if there is no existing entry with an identical tuple.
@@ -1025,10 +1114,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
*
* Failing that, the new, unconfirmed conntrack is still added to the table
* provided that the collision only occurs in the ORIGINAL direction.
- * The new entry will be added after the existing one in the hash list,
+ * The new entry will be added only in the non-clashing REPLY direction,
* so packets in the ORIGINAL direction will continue to match the existing
* entry. The new entry will also have a fixed timeout so it expires --
- * due to the collision, it will not see bidirectional traffic.
+ * due to the collision, it will only see reply traffic.
*
* Returns NF_DROP if the clash could not be resolved.
*/
@@ -1060,7 +1149,6 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
return ret;
drop:
- nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
@@ -1070,6 +1158,7 @@ drop:
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
+ unsigned int chainlen = 0, sequence, max_chainlen;
const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
@@ -1078,7 +1167,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
- unsigned int sequence;
int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
@@ -1100,8 +1188,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
@@ -1121,32 +1209,49 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_DROP;
}
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ goto dying;
+ }
+
pr_debug("Confirming conntrack %p\n", ct);
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
+ ct->status |= IPS_CONFIRMED;
if (unlikely(nf_ct_is_dying(ct))) {
- nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
+ max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ chainlen = 0;
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen) {
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ NF_CT_STAT_INC(net, insert_failed);
+ ret = NF_DROP;
+ goto dying;
+ }
+ }
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
@@ -1164,6 +1269,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
+ /* ext area is still valid (rcu read lock is held),
+ * but will go out of scope soon; we need to remove
+ * this conntrack again.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+ }
+
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
@@ -1200,7 +1315,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
rcu_read_lock();
begin:
nf_conntrack_get_ht(&ct_hash, &hsize);
- hash = __hash_conntrack(net, tuple, hsize);
+ hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1227,7 +1342,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
* Let nf_ct_resolve_clash() deal with this later.
*/
if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
continue;
NF_CT_STAT_INC_ATOMIC(net, found);
@@ -1275,9 +1391,12 @@ static unsigned int early_drop_list(struct net *net,
nf_ct_is_dying(tmp))
continue;
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->ct_net and ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
/* kill only if still in same netns -- might have moved due to
* SLAB_TYPESAFE_BY_RCU rules.
*
@@ -1342,83 +1461,107 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
return false;
}
-#define DAY (86400 * HZ)
-
-/* Set an arbitrary timeout large enough not to ever expire, this save
- * us a check for the IPS_OFFLOAD_BIT from the packet path via
- * nf_ct_is_expired().
- */
-static void nf_ct_offload_timeout(struct nf_conn *ct)
-{
- if (nf_ct_expires(ct) < DAY / 2)
- ct->timeout = nfct_time_stamp + DAY;
-}
-
static void gc_worker(struct work_struct *work)
{
- unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
- unsigned int i, goal, buckets = 0, expired_count = 0;
- unsigned int nf_conntrack_max95 = 0;
+ unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
- unsigned int ratio, scanned = 0;
+ unsigned int expired_count = 0;
unsigned long next_run;
+ s32 delta_time;
+ long count;
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
- goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
- i = gc_work->last_bucket;
+ i = gc_work->next_bucket;
if (gc_work->early_drop)
nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+ if (i == 0) {
+ gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+ gc_work->count = GC_SCAN_INITIAL_COUNT;
+ gc_work->start_time = start_time;
+ }
+
+ next_run = gc_work->avg_timeout;
+ count = gc_work->count;
+
+ end_time = start_time + GC_SCAN_MAX_DURATION;
+
do {
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int hashsz;
struct nf_conn *tmp;
- i++;
rcu_read_lock();
nf_conntrack_get_ht(&ct_hash, &hashsz);
- if (i >= hashsz)
- i = 0;
+ if (i >= hashsz) {
+ rcu_read_unlock();
+ break;
+ }
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ struct nf_conntrack_net *cnet;
struct net *net;
+ long expires;
tmp = nf_ct_tuplehash_to_ctrack(h);
- scanned++;
if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
nf_ct_offload_timeout(tmp);
continue;
}
+ if (expired_count > GC_SCAN_EXPIRED_MAX) {
+ rcu_read_unlock();
+
+ gc_work->next_bucket = i;
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+
+ delta_time = nfct_time_stamp - gc_work->start_time;
+
+ /* re-sched immediately if total cycle time is exceeded */
+ next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
+ goto early_exit;
+ }
+
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
expired_count++;
continue;
}
+ expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+ expires = (expires - (long)next_run) / ++count;
+ next_run += expires;
+
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
net = nf_ct_net(tmp);
- if (atomic_read(&net->ct.count) < nf_conntrack_max95)
+ cnet = nf_ct_pernet(net);
+ if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
/* need to take reference to avoid possible races */
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (gc_worker_skip_ct(tmp)) {
nf_ct_put(tmp);
continue;
}
- if (gc_worker_can_early_drop(tmp))
+ if (gc_worker_can_early_drop(tmp)) {
nf_ct_kill(tmp);
+ expired_count++;
+ }
nf_ct_put(tmp);
}
@@ -1429,51 +1572,41 @@ static void gc_worker(struct work_struct *work)
*/
rcu_read_unlock();
cond_resched();
- } while (++buckets < goal);
+ i++;
- if (gc_work->exiting)
- return;
+ delta_time = nfct_time_stamp - end_time;
+ if (delta_time > 0 && i < hashsz) {
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+ gc_work->next_bucket = i;
+ next_run = 0;
+ goto early_exit;
+ }
+ } while (i < hashsz);
- /*
- * Eviction will normally happen from the packet path, and not
- * from this gc worker.
- *
- * This worker is only here to reap expired entries when system went
- * idle after a busy period.
- *
- * The heuristics below are supposed to balance conflicting goals:
- *
- * 1. Minimize time until we notice a stale entry
- * 2. Maximize scan intervals to not waste cycles
- *
- * Normally, expire ratio will be close to 0.
- *
- * As soon as a sizeable fraction of the entries have expired
- * increase scan frequency.
- */
- ratio = scanned ? expired_count * 100 / scanned : 0;
- if (ratio > GC_EVICT_RATIO) {
- gc_work->next_gc_run = min_interval;
- } else {
- unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
+ gc_work->next_bucket = 0;
- BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
+ next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
- gc_work->next_gc_run += min_interval;
- if (gc_work->next_gc_run > max)
- gc_work->next_gc_run = max;
- }
+ delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+ if (next_run > (unsigned long)delta_time)
+ next_run -= delta_time;
+ else
+ next_run = 1;
+
+early_exit:
+ if (gc_work->exiting)
+ return;
+
+ if (next_run)
+ gc_work->early_drop = false;
- next_run = gc_work->next_gc_run;
- gc_work->last_bucket = i;
- gc_work->early_drop = false;
queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
- INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
- gc_work->next_gc_run = HZ;
+ INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
gc_work->exiting = false;
}
@@ -1484,17 +1617,18 @@ __nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ unsigned int ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
- atomic_inc(&net->ct.count);
+ ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max &&
- unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+ if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
return ERR_PTR(-ENOMEM);
}
@@ -1515,21 +1649,19 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
- ct->timeout = 0;
+ WRITE_ONCE(ct->timeout, 0);
write_pnet(&ct->ct_net, net);
- memset(&ct->__nfct_init_offset[0], 0,
- offsetof(struct nf_conn, proto) -
- offsetof(struct nf_conn, __nfct_init_offset[0]));
+ memset_after(ct, 0, __nfct_init_offset);
nf_ct_zone_add(ct, zone);
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
- atomic_set(&ct->ct_general.use, 0);
+ refcount_set(&ct->ct_general.use, 0);
return ct;
out:
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
return ERR_PTR(-ENOMEM);
}
@@ -1546,16 +1678,29 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
void nf_conntrack_free(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_net *cnet;
/* A freed object has refcnt == 0, that's
* the golden rule for SLAB_TYPESAFE_BY_RCU
*/
- WARN_ON(atomic_read(&ct->ct_general.use) != 0);
+ WARN_ON(refcount_read(&ct->ct_general.use) != 0);
+
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ const struct nf_nat_hook *nat_hook;
- nf_ct_ext_destroy(ct);
+ rcu_read_lock();
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (nat_hook)
+ nat_hook->remove_nat_bysrc(ct);
+ rcu_read_unlock();
+ }
+
+ kfree(ct->ext);
kmem_cache_free(nf_conntrack_cachep, ct);
+ cnet = nf_ct_pernet(net);
+
smp_mb__before_atomic();
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
@@ -1571,11 +1716,14 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct nf_conntrack_ecache *ecache;
+#endif
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
+ struct nf_conntrack_net *cnet;
if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
pr_debug("Can't invert tuple.\n");
@@ -1603,14 +1751,21 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
- nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
- ecache ? ecache->expmask : 0,
- GFP_ATOMIC);
- local_bh_disable();
- if (net->ct.expect_count) {
- spin_lock(&nf_conntrack_expect_lock);
+ if ((ecache || net->ct.sysctl_events) &&
+ !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
+ }
+#endif
+
+ cnet = nf_ct_pernet(net);
+ if (cnet->expect_count) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
pr_debug("expectation arrives ct=%p exp=%p\n",
@@ -1633,16 +1788,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
#endif
NF_CT_STAT_INC(net, expect_new);
}
- spin_unlock(&nf_conntrack_expect_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
- if (!exp)
+ if (!exp && tmpl)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- /* Now it is inserted into the unconfirmed list, bump refcount */
- nf_conntrack_get(&ct->ct_general);
- nf_ct_add_to_unconfirmed_list(ct);
+ /* Other CPU might have obtained a pointer to this object before it was
+ * released. Because refcount is 0, refcount_inc_not_zero() will fail.
+ *
+ * After refcount_set(1) it will succeed; ensure that zeroing of
+ * ct->status and the correct ct->net pointer are visible; else other
+ * core might observe CONFIRMED bit which means the entry is valid and
+ * in the hash table, but it's not (anymore).
+ */
+ smp_wmb();
- local_bh_enable();
+ /* Now it is going to be associated with an sk_buff, set refcount to 1. */
+ refcount_set(&ct->ct_general.use, 1);
if (exp) {
if (exp->expectfn)
@@ -1666,8 +1828,8 @@ resolve_normal_ct(struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
+ u32 hash, zone_id, rid;
struct nf_conn *ct;
- u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, state->pf, protonum, state->net,
@@ -1678,8 +1840,20 @@ resolve_normal_ct(struct nf_conn *tmpl,
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple, state->net);
+
+ zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+ hash = hash_conntrack_raw(&tuple, zone_id, state->net);
h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
+
+ if (!h) {
+ rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+ if (zone_id != rid) {
+ u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);
+
+ h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
+ }
+ }
+
if (!h) {
h = init_conntrack(state->net, tmpl, &tuple,
skb, dataoff, hash);
@@ -1735,10 +1909,8 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl,
else
return NF_ACCEPT;
- if (ret <= 0) {
+ if (ret <= 0)
NF_CT_STAT_INC_ATOMIC(state->net, error);
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- }
return ret;
}
@@ -1812,10 +1984,8 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
- ctinfo == IP_CT_UNTRACKED) {
- NF_CT_STAT_INC_ATOMIC(state->net, ignore);
+ ctinfo == IP_CT_UNTRACKED)
return NF_ACCEPT;
- }
skb->_nfct = 0;
}
@@ -1823,7 +1993,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {
pr_debug("not prepared to track yet or error occurred\n");
- NF_CT_STAT_INC_ATOMIC(state->net, error);
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
@@ -1863,17 +2032,19 @@ repeat:
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
skb->_nfct = 0;
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- if (ret == -NF_DROP)
- NF_CT_STAT_INC_ATOMIC(state->net, drop);
/* Special case: TCP tracker reports an attempt to reopen a
* closed/aborted connection. We have to go back and create a
* fresh conntrack.
*/
if (ret == -NF_REPEAT)
goto repeat;
+
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+ if (ret == -NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(state->net, drop);
+
ret = -ret;
goto out;
}
@@ -1905,10 +2076,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
if (ct->master || (help && !hlist_empty(&help->expectations)))
return;
-
- rcu_read_lock();
- __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
@@ -1931,7 +2098,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
if (do_acct)
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1939,7 +2106,7 @@ bool nf_ct_kill_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb)
{
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
return nf_ct_delete(ct, 0, 0);
}
@@ -1972,13 +2139,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
+ struct nf_conntrack_tuple *t,
+ u_int32_t flags)
{
- if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
+ if (!tb[CTA_PROTO_SRC_PORT])
+ return -EINVAL;
+
+ t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
+ if (!tb[CTA_PROTO_DST_PORT])
+ return -EINVAL;
- t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
- t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+ t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+ }
return 0;
}
@@ -2014,22 +2190,18 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
-static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
{
+ const struct nf_nat_hook *nat_hook;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- enum ip_conntrack_info ctinfo;
- struct nf_nat_hook *nat_hook;
unsigned int status;
- struct nf_conn *ct;
int dataoff;
u16 l3num;
u8 l4num;
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || nf_ct_is_confirmed(ct))
- return 0;
-
l3num = nf_ct_l3num(ct);
dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
@@ -2086,6 +2258,78 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
return 0;
}
+/* This packet is coming from userspace via nf_queue, complete the packet
+ * processing after the helper invocation in nf_confirm().
+ *
+ * @skb:    packet that was reinjected
+ * @ct:     conntrack entry attached to @skb
+ * @ctinfo: conntrack info for @skb
+ *
+ * Returns 0 when there is nothing to do or the conntrack was confirmed,
+ * -1 when the packet must be dropped (sequence adjustment failed or
+ * nf_conntrack_confirm() returned NF_DROP).
+ */
+static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
+			       enum ip_conntrack_info ctinfo)
+{
+	const struct nf_conntrack_helper *helper;
+	const struct nf_conn_help *help;
+	int protoff;
+
+	help = nfct_help(ct);
+	if (!help)
+		return 0;
+
+	/* The help extension can be present without a helper assigned
+	 * (other code tests "help && help->helper"), so the pointer has
+	 * to be checked before ->flags is read.
+	 */
+	helper = rcu_dereference(help->helper);
+	if (!helper)
+		return 0;
+
+	/* Only helpers flagged as userspace helpers are handled here. */
+	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
+		return 0;
+
+	/* Locate the transport header offset for the seqadj call below. */
+	switch (nf_ct_l3num(ct)) {
+	case NFPROTO_IPV4:
+		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case NFPROTO_IPV6: {
+		__be16 frag_off;
+		u8 pnum;
+
+		pnum = ipv6_hdr(skb)->nexthdr;
+		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+					   &frag_off);
+		/* Nothing to do if the extension headers could not be
+		 * skipped or any fragment offset bit is set.
+		 */
+		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+			return 0;
+		break;
+	}
+#endif
+	default:
+		return 0;
+	}
+
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_is_loopback_packet(skb)) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+			return -1;
+		}
+	}
+
+	/* We've seen it coming out the other side: confirm it */
+	return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
+}
+
+static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ int err;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return 0; /* skb carries no conntrack: nothing to update */
+
+ if (!nf_ct_is_confirmed(ct)) { /* only unconfirmed entries go through the update step */
+ err = __nf_conntrack_update(net, skb, ct, ctinfo);
+ if (err < 0)
+ return err; /* propagate the negative error unchanged */
+
+ ct = nf_ct_get(skb, &ctinfo); /* re-read; NOTE(review): presumably the update can attach a different conntrack to skb -- confirm */
+ }
+
+ return nf_confirm_cthelper(skb, ct, ctinfo); /* finish helper processing and confirm */
+}
+
static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
@@ -2124,7 +2368,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
- void *data, unsigned int *bucket)
+ const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -2132,17 +2376,36 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
spinlock_t *lockp;
for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+ struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
+
+ if (hlist_nulls_empty(hslot))
+ continue;
+
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < nf_conntrack_htable_size) {
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
- goto found;
- }
+ hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
+ continue;
+ /* All nf_conn objects are added to hash table twice, one
+ * for original direction tuple, once for the reply tuple.
+ *
+ * Exception: In the IPS_NAT_CLASH case, only the reply
+ * tuple is added (the original tuple already existed for
+ * a different object).
+ *
+ * We only need to call the iterator once for each
+ * conntrack, so we just use the 'reply' direction
+ * tuple while iterating.
+ */
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (iter_data->net &&
+ !net_eq(iter_data->net, nf_ct_net(ct)))
+ continue;
+
+ if (iter(ct, iter_data->data))
+ goto found;
}
spin_unlock(lockp);
local_bh_enable();
@@ -2151,109 +2414,43 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
return NULL;
found:
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
spin_unlock(lockp);
local_bh_enable();
return ct;
}
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
+ const struct nf_ct_iter_data *iter_data)
{
- unsigned int bucket = 0, sequence;
+ unsigned int bucket = 0;
struct nf_conn *ct;
might_sleep();
- for (;;) {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
-
- while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
- /* Time to push up daises... */
-
- nf_ct_delete(ct, portid, report);
- nf_ct_put(ct);
- cond_resched();
- }
-
- if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
- break;
- bucket = 0;
- }
-}
-
-struct iter_data {
- int (*iter)(struct nf_conn *i, void *data);
- void *data;
- struct net *net;
-};
-
-static int iter_net_only(struct nf_conn *i, void *data)
-{
- struct iter_data *d = data;
-
- if (!net_eq(d->net, nf_ct_net(i)))
- return 0;
-
- return d->iter(i, d->data);
-}
-
-static void
-__nf_ct_unconfirmed_destroy(struct net *net)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct nf_conntrack_tuple_hash *h;
- struct hlist_nulls_node *n;
- struct ct_pcpu *pcpu;
-
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_bh(&pcpu->lock);
- hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
- struct nf_conn *ct;
-
- ct = nf_ct_tuplehash_to_ctrack(h);
+ mutex_lock(&nf_conntrack_mutex);
+ while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
+ /* Time to push up daisies... */
- /* we cannot call iter() on unconfirmed list, the
- * owning cpu can reallocate ct->ext at any time.
- */
- set_bit(IPS_DYING_BIT, &ct->status);
- }
- spin_unlock_bh(&pcpu->lock);
+ nf_ct_delete(ct, iter_data->portid, iter_data->report);
+ nf_ct_put(ct);
cond_resched();
}
+ mutex_unlock(&nf_conntrack_mutex);
}
-void nf_ct_unconfirmed_destroy(struct net *net)
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+ const struct nf_ct_iter_data *iter_data)
{
- might_sleep();
-
- if (atomic_read(&net->ct.count) > 0) {
- __nf_ct_unconfirmed_destroy(net);
- nf_queue_nf_hook_drop(net);
- synchronize_net();
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
-
-void nf_ct_iterate_cleanup_net(struct net *net,
- int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
-{
- struct iter_data d;
+ struct net *net = iter_data->net;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
might_sleep();
- if (atomic_read(&net->ct.count) == 0)
+ if (atomic_read(&cnet->count) == 0)
return;
- d.iter = iter;
- d.data = data;
- d.net = net;
-
- nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
+ nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
@@ -2271,43 +2468,56 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
down_read(&net_rwsem);
for_each_net(net) {
- if (atomic_read(&net->ct.count) == 0)
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ if (atomic_read(&cnet->count) == 0)
continue;
- __nf_ct_unconfirmed_destroy(net);
nf_queue_nf_hook_drop(net);
}
up_read(&net_rwsem);
/* Need to wait for netns cleanup worker to finish, if its
* running -- it might have deleted a net namespace from
- * the global list, so our __nf_ct_unconfirmed_destroy() might
- * not have affected all namespaces.
+ * the global list, so hook drop above might not have
+ * affected all namespaces.
*/
net_ns_barrier();
- /* a conntrack could have been unlinked from unconfirmed list
- * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
+ /* a skb w. unconfirmed conntrack could have been reinjected just
+ * before we called nf_queue_nf_hook_drop().
+ *
* This makes sure its inserted into conntrack table.
*/
synchronize_net();
- nf_ct_iterate_cleanup(iter, data, 0, 0);
+ nf_ct_ext_bump_genid();
+ iter_data.data = data;
+ nf_ct_iterate_cleanup(iter, &iter_data);
+
+ /* Another cpu might be in a rcu read section with
+ * rcu protected pointer cleared in iter callback
+ * or hidden via nf_ct_ext_bump_genid() above.
+ *
+ * Wait until those are done.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
static int kill_all(struct nf_conn *i, void *data)
{
- return net_eq(nf_ct_net(i), data);
+ return 1;
}
void nf_conntrack_cleanup_start(void)
{
+ cleanup_nf_conntrack_bpf();
conntrack_gc_work.exiting = true;
- RCU_INIT_POINTER(ip_ct_attach, NULL);
}
void nf_conntrack_cleanup_end(void)
@@ -2317,13 +2527,7 @@ void nf_conntrack_cleanup_end(void)
kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
- nf_conntrack_seqadj_fini();
- nf_conntrack_labels_fini();
nf_conntrack_helper_fini();
- nf_conntrack_timeout_fini();
- nf_conntrack_ecache_fini();
- nf_conntrack_tstamp_fini();
- nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2343,8 +2547,9 @@ void nf_conntrack_cleanup_net(struct net *net)
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
- int busy;
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
+ int busy;
/*
* This makes sure all current packets have passed through
@@ -2355,8 +2560,11 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
i_see_dead_people:
busy = 0;
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_iterate_cleanup(kill_all, net, 0, 0);
- if (atomic_read(&net->ct.count) != 0)
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ iter_data.net = net;
+ nf_ct_iterate_cleanup_net(kill_all, &iter_data);
+ if (atomic_read(&cnet->count) != 0)
busy = 1;
}
if (busy) {
@@ -2365,11 +2573,9 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_conntrack_proto_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
- free_percpu(net->ct.pcpu_lists);
}
}
@@ -2409,8 +2615,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
if (!hash)
return -ENOMEM;
+ mutex_lock(&nf_conntrack_mutex);
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
+ mutex_unlock(&nf_conntrack_mutex);
kvfree(hash);
return 0;
}
@@ -2427,16 +2635,19 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
for (i = 0; i < nf_conntrack_htable_size; i++) {
while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ unsigned int zone_id;
+
h = hlist_nulls_entry(nf_conntrack_hash[i].first,
struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
+
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
bucket = __hash_conntrack(nf_ct_net(ct),
- &h->tuple, hashsize);
+ &h->tuple, zone_id, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = nf_conntrack_htable_size;
old_hash = nf_conntrack_hash;
nf_conntrack_hash = hash;
@@ -2446,6 +2657,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
nf_conntrack_all_unlock();
local_bh_enable();
+ mutex_unlock(&nf_conntrack_mutex);
+
synchronize_net();
kvfree(old_hash);
return 0;
@@ -2470,36 +2683,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return nf_conntrack_hash_resize(hashsize);
}
-static __always_inline unsigned int total_extension_size(void)
-{
- /* remember to add new extensions below */
- BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
-
- return sizeof(struct nf_ct_ext) +
- sizeof(struct nf_conn_help)
-#if IS_ENABLED(CONFIG_NF_NAT)
- + sizeof(struct nf_conn_nat)
-#endif
- + sizeof(struct nf_conn_seqadj)
- + sizeof(struct nf_conn_acct)
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- + sizeof(struct nf_conntrack_ecache)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- + sizeof(struct nf_conn_tstamp)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- + sizeof(struct nf_conn_timeout)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
- + sizeof(struct nf_conn_labels)
-#endif
-#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
- + sizeof(struct nf_conn_synproxy)
-#endif
- ;
-};
-
int nf_conntrack_init_start(void)
{
unsigned long nr_pages = totalram_pages();
@@ -2507,35 +2690,31 @@ int nf_conntrack_init_start(void)
int ret = -ENOMEM;
int i;
- /* struct nf_ct_ext uses u8 to store offsets/size */
- BUILD_BUG_ON(total_extension_size() > 255u);
-
- seqcount_init(&nf_conntrack_generation);
+ seqcount_spinlock_init(&nf_conntrack_generation,
+ &nf_conntrack_locks_all_lock);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
if (!nf_conntrack_htable_size) {
- /* Idea from tcp.c: use 1/16384 of memory.
- * On i386: 32MB machine has 512 buckets.
- * >= 1GB machines have 16384 buckets.
- * >= 4GB machines have 65536 buckets.
- */
nf_conntrack_htable_size
= (((nr_pages << PAGE_SHIFT) / 16384)
/ sizeof(struct hlist_head));
- if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
- nf_conntrack_htable_size = 65536;
+ if (BITS_PER_LONG >= 64 &&
+ nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
+ nf_conntrack_htable_size = 262144;
else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
- nf_conntrack_htable_size = 16384;
- if (nf_conntrack_htable_size < 32)
- nf_conntrack_htable_size = 32;
+ nf_conntrack_htable_size = 65536;
- /* Use a max. factor of four by default to get the same max as
- * with the old struct list_heads. When a table size is given
- * we use the old value of 8 to avoid reducing the max.
- * entries. */
- max_factor = 4;
+ if (nf_conntrack_htable_size < 1024)
+ nf_conntrack_htable_size = 1024;
+ /* Use a max. factor of one by default to keep the average
+ * hash chain length at 2 entries. Each entry has to be added
+ * twice (once for original direction, once for reply).
+ * When a table size is given we use the old value of 8 to
+ * avoid implicit reduction of the max entries setting.
+ */
+ max_factor = 1;
}
nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
@@ -2555,34 +2734,10 @@ int nf_conntrack_init_start(void)
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_init();
- if (ret < 0)
- goto err_acct;
-
- ret = nf_conntrack_tstamp_init();
- if (ret < 0)
- goto err_tstamp;
-
- ret = nf_conntrack_ecache_init();
- if (ret < 0)
- goto err_ecache;
-
- ret = nf_conntrack_timeout_init();
- if (ret < 0)
- goto err_timeout;
-
ret = nf_conntrack_helper_init();
if (ret < 0)
goto err_helper;
- ret = nf_conntrack_labels_init();
- if (ret < 0)
- goto err_labels;
-
- ret = nf_conntrack_seqadj_init();
- if (ret < 0)
- goto err_seqadj;
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -2590,23 +2745,18 @@ int nf_conntrack_init_start(void)
conntrack_gc_work_init(&conntrack_gc_work);
queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
+ ret = register_nf_conntrack_bpf();
+ if (ret < 0)
+ goto err_kfunc;
+
return 0;
+err_kfunc:
+ cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+ nf_conntrack_proto_fini();
err_proto:
- nf_conntrack_seqadj_fini();
-err_seqadj:
- nf_conntrack_labels_fini();
-err_labels:
nf_conntrack_helper_fini();
err_helper:
- nf_conntrack_timeout_fini();
-err_timeout:
- nf_conntrack_ecache_fini();
-err_ecache:
- nf_conntrack_tstamp_fini();
-err_tstamp:
- nf_conntrack_acct_fini();
-err_acct:
nf_conntrack_expect_fini();
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2615,16 +2765,15 @@ err_cachep:
return ret;
}
-static struct nf_ct_hook nf_conntrack_hook = {
+static const struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
- .destroy = destroy_conntrack,
+ .destroy = nf_ct_destroy,
.get_tuple_skb = nf_conntrack_get_tuple_skb,
+ .attach = nf_conntrack_attach,
};
void nf_conntrack_init_end(void)
{
- /* For use by REJECT target */
- RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
@@ -2632,33 +2781,19 @@ void nf_conntrack_init_end(void)
* We need to use special "null" values, not used in hash table
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
-#define DYING_NULLS_VAL ((1<<30)+1)
-#define TEMPLATE_NULLS_VAL ((1<<30)+2)
int nf_conntrack_init_net(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
int ret = -ENOMEM;
- int cpu;
BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
- atomic_set(&net->ct.count, 0);
-
- net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
- if (!net->ct.pcpu_lists)
- goto err_stat;
-
- for_each_possible_cpu(cpu) {
- struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_init(&pcpu->lock);
- INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- }
+ atomic_set(&cnet->count, 0);
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat)
- goto err_pcpu_lists;
+ return ret;
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
@@ -2667,15 +2802,67 @@ int nf_conntrack_init_net(struct net *net)
nf_conntrack_acct_pernet_init(net);
nf_conntrack_tstamp_pernet_init(net);
nf_conntrack_ecache_pernet_init(net);
- nf_conntrack_helper_pernet_init(net);
nf_conntrack_proto_pernet_init(net);
return 0;
err_expect:
free_percpu(net->ct.stat);
-err_pcpu_lists:
- free_percpu(net->ct.pcpu_lists);
-err_stat:
return ret;
}
+
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
+{
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return -EPERM;
+
+ __nf_ct_set_timeout(ct, timeout);
+
+ if (test_bit(IPS_DYING_BIT, &ct->status))
+ return -ETIME;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
+
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
+{
+ unsigned int bit;
+
+ /* Ignore these unchangeable bits */
+ on &= ~IPS_UNCHANGEABLE_MASK;
+ off &= ~IPS_UNCHANGEABLE_MASK;
+
+ for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+ if (on & (1 << bit))
+ set_bit(bit, &ct->status);
+ else if (off & (1 << bit))
+ clear_bit(bit, &ct->status);
+ }
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_status);
+
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
+{
+ unsigned long d;
+
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EBUSY;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ __nf_ct_change_status(ct, status, 0);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 7956c9f19899..8698b3424646 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -16,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
-#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -29,8 +28,9 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-#define ECACHE_RETRY_WAIT (HZ/10)
-#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
+#define DYING_NULLS_VAL ((1 << 30) + 1)
+#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
+#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)
enum retry_state {
STATE_CONGESTED,
@@ -38,150 +38,163 @@ enum retry_state {
STATE_DONE,
};
-static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
- struct nf_conn *refs[ECACHE_STACK_ALLOC];
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return &cnet->ecache;
+}
+#if IS_MODULE(CONFIG_NF_CT_NETLINK)
+EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
+#endif
+
+static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
+{
+ unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
+ struct hlist_nulls_head evicted_list;
enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int evicted = 0;
+ unsigned int sent;
- spin_lock(&pcpu->lock);
+ INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);
- hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+next:
+ sent = 0;
+ spin_lock_bh(&cnet->ecache.dying_lock);
+
+ hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
- struct nf_conntrack_ecache *e;
-
- if (!nf_ct_is_confirmed(ct))
- continue;
-
- /* This ecache access is safe because the ct is on the
- * pcpu dying list and we hold the spinlock -- the entry
- * cannot be free'd until after the lock is released.
- *
- * This is true even if ct has a refcount of 0: the
- * cpu that is about to free the entry must remove it
- * from the dying list and needs the lock to do so.
- */
- e = nf_ct_ecache_find(ct);
- if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
- continue;
- /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
- * the worker owns this entry: the ct will remain valid
- * until the worker puts its ct reference.
+ /* The worker owns all entries, ct remains valid until nf_ct_put
+ * in the loop below.
*/
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}
- e->state = NFCT_ECACHE_DESTROY_SENT;
- refs[evicted] = ct;
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);
- if (++evicted >= ARRAY_SIZE(refs)) {
+ if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}
+
+ if (sent++ > 16) {
+ spin_unlock_bh(&cnet->ecache.dying_lock);
+ cond_resched();
+ goto next;
+ }
}
- spin_unlock(&pcpu->lock);
+ spin_unlock_bh(&cnet->ecache.dying_lock);
- /* can't _put while holding lock */
- while (evicted)
- nf_ct_put(refs[--evicted]);
+ hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+ nf_ct_put(ct);
+
+ cond_resched();
+ }
return ret;
}
static void ecache_work(struct work_struct *work)
{
- struct netns_ct *ctnet =
- container_of(work, struct netns_ct, ecache_dwork.work);
- int cpu, delay = -1;
- struct ct_pcpu *pcpu;
+ struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
+ int ret, delay = -1;
+
+ ret = ecache_work_evict_list(cnet);
+ switch (ret) {
+ case STATE_CONGESTED:
+ delay = ECACHE_RETRY_JIFFIES;
+ break;
+ case STATE_RESTART:
+ delay = 0;
+ break;
+ case STATE_DONE:
+ break;
+ }
- local_bh_disable();
+ if (delay >= 0)
+ schedule_delayed_work(&cnet->ecache.dwork, delay);
+}
- for_each_possible_cpu(cpu) {
- enum retry_state ret;
+static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
+ const u32 events,
+ const u32 missed,
+ const struct nf_ct_event *item)
+{
+ struct net *net = nf_ct_net(item->ct);
+ struct nf_ct_event_notifier *notify;
+ u32 old, want;
+ int ret;
- pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
+ if (!((events | missed) & e->ctmask))
+ return 0;
- ret = ecache_work_evict_list(pcpu);
+ rcu_read_lock();
- switch (ret) {
- case STATE_CONGESTED:
- delay = ECACHE_RETRY_WAIT;
- goto out;
- case STATE_RESTART:
- delay = 0;
- break;
- case STATE_DONE:
- break;
- }
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (!notify) {
+ rcu_read_unlock();
+ return 0;
}
- out:
- local_bh_enable();
+ ret = notify->ct_event(events | missed, item);
+ rcu_read_unlock();
+
+ if (likely(ret >= 0 && missed == 0))
+ return 0;
- ctnet->ecache_dwork_pending = delay > 0;
- if (delay >= 0)
- schedule_delayed_work(&ctnet->ecache_dwork, delay);
+ do {
+ old = READ_ONCE(e->missed);
+ if (ret < 0)
+ want = old | events;
+ else
+ want = old & ~missed;
+ } while (cmpxchg(&e->missed, old, want) != old);
+
+ return ret;
}
-int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
+int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
u32 portid, int report)
{
- int ret = 0;
- struct net *net = nf_ct_net(ct);
- struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ unsigned int missed;
+ int ret;
- rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (!notify)
- goto out_unlock;
+ if (!nf_ct_is_confirmed(ct))
+ return 0;
e = nf_ct_ecache_find(ct);
if (!e)
- goto out_unlock;
+ return 0;
- if (nf_ct_is_confirmed(ct)) {
- struct nf_ct_event item = {
- .ct = ct,
- .portid = e->portid ? e->portid : portid,
- .report = report
- };
- /* This is a resent of a destroy event? If so, skip missed */
- unsigned long missed = e->portid ? 0 : e->missed;
-
- if (!((eventmask | missed) & e->ctmask))
- goto out_unlock;
-
- ret = notify->fcn(eventmask | missed, &item);
- if (unlikely(ret < 0 || missed)) {
- spin_lock_bh(&ct->lock);
- if (ret < 0) {
- /* This is a destroy event that has been
- * triggered by a process, we store the PORTID
- * to include it in the retransmission.
- */
- if (eventmask & (1 << IPCT_DESTROY)) {
- if (e->portid == 0 && portid != 0)
- e->portid = portid;
- e->state = NFCT_ECACHE_DESTROY_FAIL;
- } else {
- e->missed |= eventmask;
- }
- } else {
- e->missed &= ~missed;
- }
- spin_unlock_bh(&ct->lock);
- }
+ memset(&item, 0, sizeof(item));
+
+ item.ct = ct;
+ item.portid = e->portid ? e->portid : portid;
+ item.report = report;
+
+ /* This is a resend of a destroy event? If so, skip missed */
+ missed = e->portid ? 0 : e->missed;
+
+ ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
+ if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
+ /* This is a destroy event that has been triggered by a process,
+ * we store the PORTID to include it in the retransmission.
+ */
+ if (e->portid == 0 && portid != 0)
+ e->portid = portid;
}
-out_unlock:
- rcu_read_unlock();
+
return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
@@ -190,53 +203,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
* disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
- unsigned long events, missed;
- struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- int ret;
-
- rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (notify == NULL)
- goto out_unlock;
+ unsigned int events;
if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
- goto out_unlock;
+ return;
e = nf_ct_ecache_find(ct);
if (e == NULL)
- goto out_unlock;
+ return;
events = xchg(&e->cache, 0);
- /* We make a copy of the missed event cache without taking
- * the lock, thus we may send missed events twice. However,
- * this does not harm and it happens very rarely. */
- missed = e->missed;
-
- if (!((events | missed) & e->ctmask))
- goto out_unlock;
-
item.ct = ct;
item.portid = 0;
item.report = 0;
- ret = notify->fcn(events | missed, &item);
-
- if (likely(ret == 0 && !missed))
- goto out_unlock;
-
- spin_lock_bh(&ct->lock);
- if (ret < 0)
- e->missed |= events;
- else
- e->missed &= ~missed;
- spin_unlock_bh(&ct->lock);
-
-out_unlock:
- rcu_read_unlock();
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does no harm and it happens very rarely.
+ */
+ __nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
@@ -246,11 +234,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
{
struct net *net = nf_ct_exp_net(exp);
- struct nf_exp_event_notifier *notify;
+ struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_expect_event_cb);
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
if (!notify)
goto out_unlock;
@@ -264,118 +252,107 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
.portid = portid,
.report = report
};
- notify->fcn(1 << event, &item);
+ notify->exp_event(1 << event, &item);
}
out_unlock:
rcu_read_unlock();
}
-int nf_conntrack_register_notifier(struct net *net,
- struct nf_ct_event_notifier *new)
+void nf_conntrack_register_notifier(struct net *net,
+ const struct nf_ct_event_notifier *new)
{
- int ret;
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
- if (notify != NULL) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ WARN_ON_ONCE(notify);
rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
- ret = 0;
-
-out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
-void nf_conntrack_unregister_notifier(struct net *net,
- struct nf_ct_event_notifier *new)
+void nf_conntrack_unregister_notifier(struct net *net)
{
- struct nf_ct_event_notifier *notify;
-
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
- /* synchronize_rcu() is called from ctnetlink_exit. */
+ /* synchronize_rcu() is called after netns pre_exit */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
-int nf_ct_expect_register_notifier(struct net *net,
- struct nf_exp_event_notifier *new)
+void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
- int ret;
- struct nf_exp_event_notifier *notify;
-
- mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- if (notify != NULL) {
- ret = -EBUSY;
- goto out_unlock;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ if (state == NFCT_ECACHE_DESTROY_FAIL &&
+ !delayed_work_pending(&cnet->ecache.dwork)) {
+ schedule_delayed_work(&cnet->ecache.dwork, HZ);
+ net->ct.ecache_dwork_pending = true;
+ } else if (state == NFCT_ECACHE_DESTROY_SENT) {
+ if (!hlist_nulls_empty(&cnet->ecache.dying_list))
+ mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+ else
+ net->ct.ecache_dwork_pending = false;
}
- rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
- ret = 0;
-
-out_unlock:
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
-void nf_ct_expect_unregister_notifier(struct net *net,
- struct nf_exp_event_notifier *new)
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
- struct nf_exp_event_notifier *notify;
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *e;
- mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- BUG_ON(notify != new);
- RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
- mutex_unlock(&nf_ct_ecache_mutex);
- /* synchronize_rcu() is called from ctnetlink_exit. */
+ switch (net->ct.sysctl_events) {
+ case 0:
+ /* assignment via template / ruleset? ignore sysctl. */
+ if (ctmask || expmask)
+ break;
+ return true;
+ case 2: /* autodetect: no event listener, don't allocate extension. */
+ if (!READ_ONCE(net->ct.ctnetlink_has_listener))
+ return true;
+ fallthrough;
+ case 1:
+ /* always allocate an extension. */
+ if (!ctmask && !expmask) {
+ ctmask = ~0;
+ expmask = ~0;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return true;
+ }
+
+ e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
+ if (e) {
+ e->ctmask = ctmask;
+ e->expmask = expmask;
+ }
+
+ return e != NULL;
}
-EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
-#define NF_CT_EVENTS_DEFAULT 1
+#define NF_CT_EVENTS_DEFAULT 2
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
-static const struct nf_ct_ext_type event_extend = {
- .len = sizeof(struct nf_conntrack_ecache),
- .align = __alignof__(struct nf_conntrack_ecache),
- .id = NF_CT_EXT_ECACHE,
-};
-
void nf_conntrack_ecache_pernet_init(struct net *net)
{
- net->ct.sysctl_events = nf_ct_events;
- INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
-}
-
-void nf_conntrack_ecache_pernet_fini(struct net *net)
-{
- cancel_delayed_work_sync(&net->ct.ecache_dwork);
-}
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-int nf_conntrack_ecache_init(void)
-{
- int ret = nf_ct_extend_register(&event_extend);
- if (ret < 0)
- pr_err("Unable to register event extension\n");
+ net->ct.sysctl_events = nf_ct_events;
- BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */
+ INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
+ INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
+ spin_lock_init(&cnet->ecache.dying_lock);
- return ret;
+ BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
-void nf_conntrack_ecache_fini(void)
+void nf_conntrack_ecache_pernet_fini(struct net *net)
{
- nf_ct_extend_unregister(&event_extend);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ cancel_delayed_work_sync(&cnet->ecache.dwork);
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 42557d2b6a90..96948e98ec53 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -17,7 +17,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
-static unsigned int nf_ct_expect_hashrnd __read_mostly;
+static siphash_aligned_key_t nf_ct_expect_hashrnd;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -49,12 +49,15 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
+ struct nf_conntrack_net *cnet;
WARN_ON(!master_help);
WARN_ON(timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
- net->ct.expect_count--;
+
+ cnet = nf_ct_pernet(net);
+ cnet->expect_count--;
hlist_del_rcu(&exp->lnode);
master_help->expecting[exp->class]--;
@@ -78,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t)
static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- unsigned int hash, seed;
+ struct {
+ union nf_inet_addr dst_addr;
+ u32 net_mix;
+ u16 dport;
+ u8 l3num;
+ u8 protonum;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
+ u32 hash;
get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
- seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
+ memset(&combined, 0, sizeof(combined));
+
+ combined.dst_addr = tuple->dst.u3;
+ combined.net_mix = net_hash_mix(n);
+ combined.dport = (__force __u16)tuple->dst.u.all;
+ combined.l3num = tuple->src.l3num;
+ combined.protonum = tuple->dst.protonum;
- hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
- (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ seed);
+ hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
@@ -118,10 +132,11 @@ __nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i;
unsigned int h;
- if (!net->ct.expect_count)
+ if (!cnet->expect_count)
return NULL;
h = nf_ct_expect_dst_hash(net, tuple);
@@ -158,10 +173,11 @@ nf_ct_find_expectation(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i, *exp = NULL;
unsigned int h;
- if (!net->ct.expect_count)
+ if (!cnet->expect_count)
return NULL;
h = nf_ct_expect_dst_hash(net, tuple);
@@ -187,12 +203,12 @@ nf_ct_find_expectation(struct net *net,
* about to invoke ->destroy(), or nf_ct_delete() via timeout
* or early_drop().
*
- * The atomic_inc_not_zero() check tells: If that fails, we
+ * The refcount_inc_not_zero() check tells: If that fails, we
* know that the ct is being destroyed. If it succeeds, we
* can be sure the ct cannot disappear underneath.
*/
if (unlikely(nf_ct_is_dying(exp->master) ||
- !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ !refcount_inc_not_zero(&exp->master->ct_general.use)))
return NULL;
if (exp->flags & NF_CT_EXPECT_PERMANENT) {
@@ -368,6 +384,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put);
static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
+ struct nf_conntrack_net *cnet;
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
@@ -389,7 +406,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
master_help->expecting[exp->class]++;
hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
- net->ct.expect_count++;
+ cnet = nf_ct_pernet(net);
+ cnet->expect_count++;
NF_CT_STAT_INC(net, expect_create);
}
@@ -415,6 +433,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
{
const struct nf_conntrack_expect_policy *p;
struct nf_conntrack_expect *i;
+ struct nf_conntrack_net *cnet;
struct nf_conn *master = expect->master;
struct nf_conn_help *master_help = nfct_help(master);
struct nf_conntrack_helper *helper;
@@ -458,7 +477,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
}
}
- if (net->ct.expect_count >= nf_ct_expect_max) {
+ cnet = nf_ct_pernet(net);
+ if (cnet->expect_count >= nf_ct_expect_max) {
net_warn_ratelimited("nf_conntrack: expectation table full\n");
ret = -EMFILE;
}
@@ -686,7 +706,6 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
- net->ct.expect_count = 0;
return exp_proc_init(net);
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 3dbe2329c3f1..0b513f7bf9f3 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -13,40 +13,92 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
-static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_nat.h>
+
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
-void nf_ct_ext_destroy(struct nf_conn *ct)
+atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);
+
+static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = { /* bytes per extension id */
+ [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
+#if IS_ENABLED(CONFIG_NF_NAT)
+ [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
+#endif
+ [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
+ [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
+#endif
+};
+
+static __always_inline unsigned int total_extension_size(void)
{
- unsigned int i;
- struct nf_ct_ext_type *t;
-
- for (i = 0; i < NF_CT_EXT_NUM; i++) {
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[i]);
-
- /* Here the nf_ct_ext_type might have been unregisterd.
- * I.e., it has responsible to cleanup private
- * area in all conntracks when it is unregisterd.
- */
- if (t && t->destroy)
- t->destroy(ct);
- rcu_read_unlock();
- }
-
- kfree(ct->ext);
+ /* remember to add new extensions below */
+ BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
+
+ return sizeof(struct nf_ct_ext) +
+ sizeof(struct nf_conn_help)
+#if IS_ENABLED(CONFIG_NF_NAT)
+ + sizeof(struct nf_conn_nat)
+#endif
+ + sizeof(struct nf_conn_seqadj)
+ + sizeof(struct nf_conn_acct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ + sizeof(struct nf_conntrack_ecache)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + sizeof(struct nf_conn_tstamp)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ + sizeof(struct nf_conn_timeout)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ + sizeof(struct nf_conn_labels)
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ + sizeof(struct nf_conn_act_ct_ext)
+#endif
+ ;
}
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int newlen, newoff, oldlen, alloc;
- struct nf_ct_ext_type *t;
struct nf_ct_ext *new;
/* Conntrack must not be confirmed to avoid races on reallocation. */
WARN_ON(nf_ct_is_confirmed(ct));
+ /* struct nf_ct_ext uses u8 to store offsets/size */
+ BUILD_BUG_ON(total_extension_size() > 255u);
if (ct->ext) {
const struct nf_ct_ext *old = ct->ext;
@@ -58,24 +110,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
oldlen = sizeof(*new);
}
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[id]);
- if (!t) {
- rcu_read_unlock();
- return NULL;
- }
-
- newoff = ALIGN(oldlen, t->align);
- newlen = newoff + t->len;
- rcu_read_unlock();
+ newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext));
+ newlen = newoff + nf_ct_ext_type_len[id];
alloc = max(newlen, NF_CT_EXT_PREALLOC);
new = krealloc(ct->ext, alloc, gfp);
if (!new)
return NULL;
- if (!ct->ext)
+ if (!ct->ext) {
memset(new->offset, 0, sizeof(new->offset));
+ new->gen_id = atomic_read(&nf_conntrack_ext_genid);
+ }
new->offset[id] = newoff;
new->len = newlen;
@@ -86,30 +132,28 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
}
EXPORT_SYMBOL(nf_ct_ext_add);
-/* This MUST be called in process context. */
-int nf_ct_extend_register(const struct nf_ct_ext_type *type)
+/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
{
- int ret = 0;
+ unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
+ unsigned int this_id = READ_ONCE(ext->gen_id);
- mutex_lock(&nf_ct_ext_type_mutex);
- if (nf_ct_ext_types[type->id]) {
- ret = -EBUSY;
- goto out;
- }
+ if (!__nf_ct_ext_exist(ext, id))
+ return NULL;
- rcu_assign_pointer(nf_ct_ext_types[type->id], type);
-out:
- mutex_unlock(&nf_ct_ext_type_mutex);
- return ret;
+ if (this_id == 0 || this_id == gen_id) /* test the snapshot, not a second read */
+ return (void *)ext + ext->offset[id];
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+EXPORT_SYMBOL(__nf_ct_ext_find);
-/* This MUST be called in process context. */
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type)
+void nf_ct_ext_bump_genid(void)
{
- mutex_lock(&nf_ct_ext_type_mutex);
- RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
- mutex_unlock(&nf_ct_ext_type_mutex);
- synchronize_rcu();
+ unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid);
+
+ if (value == UINT_MAX)
+ atomic_set(&nf_conntrack_ext_genid, 1);
+
+ msleep(HZ);
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 9eca90414bb7..617f744a2e3a 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -33,10 +33,6 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
MODULE_ALIAS("ip_conntrack_ftp");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
-
-/* This is slow, but it's simple. --RR */
-static char *ftp_buffer;
-
static DEFINE_SPINLOCK(nf_ftp_lock);
#define MAX_PORTS 8
@@ -382,7 +378,7 @@ static int help(struct sk_buff *skb,
int ret;
u32 seq;
int dir = CTINFO2DIR(ctinfo);
- unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+ unsigned int matchlen, matchoff;
struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
union nf_inet_addr *daddr;
@@ -398,6 +394,9 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
}
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL)
return NF_ACCEPT;
@@ -411,9 +410,9 @@ static int help(struct sk_buff *skb,
}
datalen = skb->len - dataoff;
+ /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */
spin_lock_bh(&nf_ftp_lock);
- fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
- BUG_ON(fb_ptr == NULL);
+ fb_ptr = skb->data + dataoff;
ends_in_nl = (fb_ptr[datalen - 1] == '\n');
seq = ntohl(th->seq) + datalen;
@@ -568,7 +567,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
static void __exit nf_conntrack_ftp_fini(void)
{
nf_conntrack_helpers_unregister(ftp, ports_c * 2);
- kfree(ftp_buffer);
}
static int __init nf_conntrack_ftp_init(void)
@@ -577,10 +575,6 @@ static int __init nf_conntrack_ftp_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master));
- ftp_buffer = kmalloc(65536, GFP_KERNEL);
- if (!ftp_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
@@ -600,7 +594,6 @@ static int __init nf_conntrack_ftp_init(void)
ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(ftp_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 573cb4481481..e697a824b001 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -257,15 +257,15 @@ static unsigned int get_uint(struct bitstr *bs, int b)
case 4:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 3:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 2:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 1:
v |= *bs->cur++;
break;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 8ba037b76ad3..5a9bce24f3c3 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -34,6 +34,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_h323.h>
+#define H323_MAX_SIZE 65535
+
/* Parameters */
static unsigned int default_rrq_ttl __read_mostly = 300;
module_param(default_rrq_ttl, uint, 0600);
@@ -49,64 +51,8 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
"if both endpoints are on different sides "
"(determined by routing information)");
-/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_sig_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*set_ras_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- __be16 port, __be16 rtp_port,
- struct nf_conntrack_expect *rtp_exp,
- struct nf_conntrack_expect *rtcp_exp) __read_mostly;
-int (*nat_t120_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_h245_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_callforwarding_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_q931_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, TransportAddress *taddr, int idx,
- __be16 port, struct nf_conntrack_expect *exp)
- __read_mostly;
+const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfct_h323_nat_hook);
static DEFINE_SPINLOCK(nf_h323_lock);
static char *h323_buffer;
@@ -142,11 +88,15 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen <= 0) /* No TCP data */
goto clear_out;
+ if (tcpdatalen > H323_MAX_SIZE)
+ tcpdatalen = H323_MAX_SIZE;
+
if (*data == NULL) { /* first TPKT */
/* Get first TPKT pointer */
tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
h323_buffer);
- BUG_ON(tpkt == NULL);
+ if (!tpkt)
+ goto clear_out;
/* Validate TPKT identifier */
if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
@@ -193,7 +143,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen == 4) { /* Separate TPKT header */
/* Netmeeting sends TPKT header and data separately */
pr_debug("nf_ct_h323: separate TPKT header indicates "
- "there will be TPKT data of %hu bytes\n",
+ "there will be TPKT data of %d bytes\n",
tpktlen - 4);
info->tpkt_len[dir] = tpktlen - 4;
return 0;
@@ -258,6 +208,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
@@ -265,7 +216,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
union nf_inet_addr addr;
struct nf_conntrack_expect *rtp_exp;
struct nf_conntrack_expect *rtcp_exp;
- typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
/* Read RTP or RTCP address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -295,15 +245,16 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
IPPROTO_UDP, NULL, &rtcp_port);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
- taddr, port, rtp_port, rtp_exp, rtcp_exp);
+ ret = nathook->nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ taddr, port, rtp_port, rtp_exp, rtcp_exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(rtp_exp, 0) == 0) {
if (nf_ct_expect_related(rtcp_exp, 0) == 0) {
@@ -332,12 +283,12 @@ static int expect_t120(struct sk_buff *skb,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_t120_hook) nat_t120;
/* Read T.120 address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -354,15 +305,16 @@ static int expect_t120(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_t120(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_h323: expect T.120 ");
@@ -663,18 +615,19 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
return 1;
}
+EXPORT_SYMBOL_GPL(get_h225_addr);
static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int protoff, unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_h245_hook) nat_h245;
/* Read h245Address */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) ||
@@ -691,15 +644,16 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
IPPROTO_TCP, NULL, &port);
exp->helper = &nf_conntrack_helper_h245;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_h245(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect H.245 ");
@@ -784,13 +738,13 @@ static int expect_callforwarding(struct sk_buff *skb,
unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
struct net *net = nf_ct_net(ct);
- typeof(nat_callforwarding_hook) nat_callforwarding;
/* Read alternativeAddress */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0)
@@ -814,16 +768,17 @@ static int expect_callforwarding(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->helper = nf_conntrack_helper_q931;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Need NAT */
- ret = nat_callforwarding(skb, ct, ctinfo,
- protoff, data, dataoff,
- taddr, port, exp);
+ ret = nathook->nat_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect Call Forwarding ");
@@ -843,12 +798,12 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
Setup_UUIE *setup)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
int i;
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_q931: Setup\n");
@@ -859,9 +814,9 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
return -1;
}
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->destCallSignalAddress,
&addr, &port) &&
@@ -869,16 +824,16 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.src.u3,
- ct->tuplehash[!dir].tuple.src.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ ct->tuplehash[!dir].tuple.src.u.tcp.port);
if (ret < 0)
return -1;
}
if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
&addr, &port) &&
@@ -886,10 +841,10 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->sourceCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- ct->tuplehash[!dir].tuple.dst.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->sourceCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.dst.u.tcp.port);
if (ret < 0)
return -1;
}
@@ -1219,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NULL;
*datalen = skb->len - dataoff;
+ if (*datalen > H323_MAX_SIZE)
+ *datalen = H323_MAX_SIZE;
+
return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
}
@@ -1248,13 +1206,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
TransportAddress *taddr, int count)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
int i;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_q931_hook) nat_q931;
/* Look for the first related address */
for (i = 0; i < count; i++) {
@@ -1278,11 +1236,11 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
exp->helper = nf_conntrack_helper_q931;
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
- nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) { /* Need NAT */
- ret = nat_q931(skb, ct, ctinfo, protoff, data,
- taddr, i, port, exp);
+ ret = nathook->nat_q931(skb, ct, ctinfo, protoff, data,
+ taddr, i, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
@@ -1304,15 +1262,15 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, GatekeeperRequest *grq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: GRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) /* NATed */
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &grq->rasAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &grq->rasAddress, 1);
return 0;
}
@@ -1366,8 +1324,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationRequest *rrq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
pr_debug("nf_ct_ras: RRQ\n");
@@ -1377,12 +1335,12 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
if (ret < 0)
return -1;
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- rrq->rasAddress.item,
- rrq->rasAddress.count);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ rrq->rasAddress.item,
+ rrq->rasAddress.count);
if (ret < 0)
return -1;
}
@@ -1402,19 +1360,19 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationConfirm *rcf)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: RCF\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- rcf->callSignalAddress.item,
- rcf->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ rcf->callSignalAddress.item,
+ rcf->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1453,18 +1411,18 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, UnregistrationRequest *urq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: URQ\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- urq->callSignalAddress.item,
- urq->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ urq->callSignalAddress.item,
+ urq->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1486,39 +1444,42 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, AdmissionRequest *arq)
{
const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_ras: ARQ\n");
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (!nathook)
+ return 0;
+
if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->destCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
port == info->sig_port[dir] &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- set_h225_addr && ct->status & IPS_NAT_MASK) {
+ ct->status & IPS_NAT_MASK) {
/* Answering ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- info->sig_port[!dir]);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
}
if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
- set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Calling ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->srcCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- port);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->srcCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ port);
}
return 0;
@@ -1534,7 +1495,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: ACF\n");
@@ -1543,12 +1503,15 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
return 0;
if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
+ const struct nfct_h323_nat_hooks *nathook;
+
/* Answering ACF */
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_sig_addr(skb, ct, ctinfo, protoff, data,
- &acf->destCallSignalAddress, 1);
+ return nathook->set_sig_addr(skb, ct, ctinfo, protoff,
+ data,
+ &acf->destCallSignalAddress, 1);
return 0;
}
@@ -1577,15 +1540,15 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, LocationRequest *lrq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: LRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &lrq->replyAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &lrq->replyAddress, 1);
return 0;
}
@@ -1633,27 +1596,22 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, InfoRequestResponse *irr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: IRR\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- &irr->rasAddress, 1);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &irr->rasAddress, 1);
if (ret < 0)
return -1;
- }
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- irr->callSignalAddress.item,
- irr->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ irr->callSignalAddress.item,
+ irr->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1820,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master));
- h323_buffer = kmalloc(65536, GFP_KERNEL);
+ h323_buffer = kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL);
if (!h323_buffer)
return -ENOMEM;
ret = h323_helper_init();
@@ -1836,17 +1794,6 @@ err1:
module_init(nf_conntrack_h323_init);
module_exit(nf_conntrack_h323_fini);
-EXPORT_SYMBOL_GPL(get_h225_addr);
-EXPORT_SYMBOL_GPL(set_h245_addr_hook);
-EXPORT_SYMBOL_GPL(set_h225_addr_hook);
-EXPORT_SYMBOL_GPL(set_sig_addr_hook);
-EXPORT_SYMBOL_GPL(set_ras_addr_hook);
-EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
-EXPORT_SYMBOL_GPL(nat_t120_hook);
-EXPORT_SYMBOL_GPL(nat_h245_hook);
-EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
-EXPORT_SYMBOL_GPL(nat_q931_hook);
-
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 connection tracking helper");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 118f415928ae..ff737a76052e 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = false;
-module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
-MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 0)");
-
static DEFINE_MUTEX(nf_ct_nat_helpers_mutex);
static struct list_head nf_ct_nat_helpers __read_mostly;
@@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
(__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
}
-static struct nf_conntrack_helper *
-__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
- unsigned int h;
-
- if (!nf_ct_helper_count)
- return NULL;
-
- h = helper_hash(tuple);
- hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
- if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
- return helper;
- }
- return NULL;
-}
-
struct nf_conntrack_helper *
__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
{
@@ -165,7 +142,7 @@ nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
if (!nat) {
snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name);
rcu_read_unlock();
- request_module(mod_name);
+ request_module("%s", mod_name);
rcu_read_lock();
nat = nf_conntrack_nat_helper_find(mod_name);
@@ -209,32 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
-static struct nf_conntrack_helper *
-nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
-{
- if (!net->ct.sysctl_auto_assign_helper) {
- if (net->ct.auto_assign_helper_warned)
- return NULL;
- if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
- return NULL;
- pr_info("nf_conntrack: default automatic helper assignment "
- "has been turned off for security reasons and CT-based "
- " firewall rule not found. Use the iptables CT target "
- "to attach helpers instead.\n");
- net->ct.auto_assign_helper_warned = 1;
- return NULL;
- }
-
- return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-}
-
-
int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
gfp_t flags)
{
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
- struct net *net = nf_ct_net(ct);
/* We already got a helper explicitly attached. The function
* nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -245,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
if (test_bit(IPS_HELPER_BIT, &ct->status))
return 0;
- if (tmpl != NULL) {
- help = nfct_help(tmpl);
- if (help != NULL) {
- helper = help->helper;
- set_bit(IPS_HELPER_BIT, &ct->status);
- }
+ if (WARN_ON_ONCE(!tmpl))
+ return 0;
+
+ help = nfct_help(tmpl);
+ if (help != NULL) {
+ helper = rcu_dereference(help->helper);
+ set_bit(IPS_HELPER_BIT, &ct->status);
}
help = nfct_help(ct);
if (helper == NULL) {
- helper = nf_ct_lookup_helper(ct, net);
- if (helper == NULL) {
- if (help)
- RCU_INIT_POINTER(help->helper, NULL);
- return 0;
- }
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ return 0;
}
if (help == NULL) {
@@ -467,11 +421,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
nf_ct_iterate_destroy(unhelp, me);
-
- /* Maybe someone has gotten the helper already when unhelp above.
- * So need to wait it.
- */
- synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
@@ -549,42 +498,19 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
}
EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
-static const struct nf_ct_ext_type helper_extend = {
- .len = sizeof(struct nf_conn_help),
- .align = __alignof__(struct nf_conn_help),
- .id = NF_CT_EXT_HELPER,
-};
-
-void nf_conntrack_helper_pernet_init(struct net *net)
-{
- net->ct.auto_assign_helper_warned = false;
- net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
-}
-
int nf_conntrack_helper_init(void)
{
- int ret;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
nf_ct_helper_hash =
nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
- ret = nf_ct_extend_register(&helper_extend);
- if (ret < 0) {
- pr_err("nf_ct_helper: Unable to register helper extension.\n");
- goto out_extend;
- }
-
INIT_LIST_HEAD(&nf_ct_nat_helpers);
return 0;
-out_extend:
- kvfree(nf_ct_helper_hash);
- return ret;
}
void nf_conntrack_helper_fini(void)
{
- nf_ct_extend_unregister(&helper_extend);
kvfree(nf_ct_helper_hash);
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index e40988a2f22f..5703846bea3b 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -39,6 +39,7 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
#define HELPER_NAME "irc"
+#define MAX_SEARCH_SIZE 4095
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
@@ -121,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
int i, ret = NF_ACCEPT;
char *addr_beg_p, *addr_end_p;
typeof(nf_nat_irc_hook) nf_nat_irc;
+ unsigned int datalen;
/* If packet is coming from IRC server */
if (dir == IP_CT_DIR_REPLY)
@@ -140,23 +142,52 @@ static int help(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
+ datalen = skb->len - dataoff;
+ if (datalen > MAX_SEARCH_SIZE)
+ datalen = MAX_SEARCH_SIZE;
+
spin_lock_bh(&irc_buffer_lock);
- ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+ ib_ptr = skb_header_pointer(skb, dataoff, datalen,
irc_buffer);
- BUG_ON(ib_ptr == NULL);
+ if (!ib_ptr) {
+ spin_unlock_bh(&irc_buffer_lock);
+ return NF_ACCEPT;
+ }
data = ib_ptr;
- data_limit = ib_ptr + skb->len - dataoff;
+ data_limit = ib_ptr + datalen;
+
+ /* Skip any whitespace */
+ while (data < data_limit - 10) {
+ if (*data == ' ' || *data == '\r' || *data == '\n')
+ data++;
+ else
+ break;
+ }
+
+ /* strlen("PRIVMSG x ")=10 */
+ if (data < data_limit - 10) {
+ if (strncasecmp("PRIVMSG ", data, 8))
+ goto out;
+ data += 8;
+ }
- /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
- * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
- while (data < data_limit - (19 + MINMATCHLEN)) {
- if (memcmp(data, "\1DCC ", 5)) {
+ /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26
+ * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26
+ */
+ while (data < data_limit - (21 + MINMATCHLEN)) {
+ /* Find first " :", the start of message */
+ if (memcmp(data, " :", 2)) {
data++;
continue;
}
+ data += 2;
+
+ /* then check that place only for the DCC command */
+ if (memcmp(data, "\1DCC ", 5))
+ goto out;
data += 5;
- /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+ /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */
iph = ip_hdr(skb);
pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
@@ -172,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
pr_debug("DCC %s detected\n", dccprotos[i]);
/* we have at least
- * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid
* data left (== 14/13 bytes) */
if (parse_dcc(data, data_limit, &dcc_ip,
&dcc_port, &addr_beg_p, &addr_end_p)) {
@@ -185,8 +216,9 @@ static int help(struct sk_buff *skb, unsigned int protoff,
/* dcc_ip can be the internal OR external (NAT'ed) IP */
tuple = &ct->tuplehash[dir].tuple;
- if (tuple->src.u3.ip != dcc_ip &&
- tuple->dst.u3.ip != dcc_ip) {
+ if ((tuple->src.u3.ip != dcc_ip &&
+ ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) ||
+ dcc_port == 0) {
net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
&tuple->src.u3.ip,
&dcc_ip, dcc_port);
@@ -248,7 +280,7 @@ static int __init nf_conntrack_irc_init(void)
irc_exp_policy.max_expected = max_dcc_channels;
irc_exp_policy.timeout = dcc_timeout;
- irc_buffer = kmalloc(65536, GFP_KERNEL);
+ irc_buffer = kmalloc(MAX_SEARCH_SIZE + 1, GFP_KERNEL);
if (!irc_buffer)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 522792556632..6e70e137a0a6 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -67,6 +67,8 @@ int nf_connlabels_get(struct net *net, unsigned int bits)
net->ct.labels_used++;
spin_unlock(&nf_connlabels_lock);
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_get);
@@ -78,21 +80,3 @@ void nf_connlabels_put(struct net *net)
spin_unlock(&nf_connlabels_lock);
}
EXPORT_SYMBOL_GPL(nf_connlabels_put);
-
-static const struct nf_ct_ext_type labels_extend = {
- .len = sizeof(struct nf_conn_labels),
- .align = __alignof__(struct nf_conn_labels),
- .id = NF_CT_EXT_LABELS,
-};
-
-int nf_conntrack_labels_init(void)
-{
- BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
-
- return nf_ct_extend_register(&labels_extend);
-}
-
-void nf_conntrack_labels_fini(void)
-{
- nf_ct_extend_unregister(&labels_extend);
-}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 7f19ee259609..55415f011943 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -20,13 +20,14 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#define HELPER_NAME "netbios-ns"
#define NMBD_PORT 137
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_netbios_ns");
-MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
static unsigned int timeout __read_mostly = 3;
module_param(timeout, uint, 0400);
@@ -44,7 +45,7 @@ static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
}
static struct nf_conntrack_helper helper __read_mostly = {
- .name = "netbios-ns",
+ .name = HELPER_NAME,
.tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6a1c8f1f6171..7562b215b932 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -54,8 +54,16 @@
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
+#include "nf_internals.h"
+
MODULE_LICENSE("GPL");
+struct ctnetlink_list_dump_ctx { /* per-dump state, read back from cb->ctx by the list dump helpers */
+ struct nf_conn *last; /* entry at which an interrupted dump resumes; ctnetlink_done_list() puts it */
+ unsigned int cpu; /* NOTE(review): not used in the code visible here - presumably a per-cpu cursor, confirm */
+ bool done; /* NOTE(review): not used in the code visible here - presumably set once the dump completed, confirm */
+};
+
static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l4proto *l4proto)
@@ -165,10 +173,14 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
+ bool skip_zero)
{
long timeout = nf_ct_expires(ct) / HZ;
+ if (skip_zero && timeout == 0)
+ return 0;
+
if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
goto nla_put_failure;
return 0;
@@ -177,7 +189,8 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct,
+ bool destroy)
{
const struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -191,7 +204,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
if (!nest_proto)
goto nla_put_failure;
- ret = l4proto->to_nlattr(skb, nest_proto, ct);
+ ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy);
nla_nest_end(skb, nest_proto);
@@ -211,6 +224,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
if (!help)
return 0;
+ rcu_read_lock();
helper = rcu_dereference(help->helper);
if (!helper)
goto out;
@@ -226,9 +240,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
nla_nest_end(skb, nest_helper);
out:
+ rcu_read_unlock();
return 0;
nla_put_failure:
+ rcu_read_unlock();
return -1;
}
@@ -498,7 +514,7 @@ nla_put_failure:
static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
- if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use))))
goto nla_put_failure;
return 0;
@@ -535,8 +551,8 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
return -1;
if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) &&
- (ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_protoinfo(skb, ct) < 0))
+ (ctnetlink_dump_timeout(skb, ct, false) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0))
return -1;
return 0;
@@ -544,24 +560,21 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
- struct nf_conn *ct, bool extinfo)
+ struct nf_conn *ct, bool extinfo, unsigned int flags)
{
const struct nf_conntrack_zone *zone;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
- unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+ unsigned int event;
+ if (portid)
+ flags |= NLM_F_MULTI;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct),
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = nf_ct_l3num(ct);
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
zone = nf_ct_zone(ct);
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
@@ -699,12 +712,11 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
}
static int
-ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
{
const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
struct nf_conn *ct = item->ct;
struct sk_buff *skb;
@@ -734,15 +746,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto errout;
type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type);
- nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct),
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = nf_ct_l3num(ct);
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
zone = nf_ct_zone(ct);
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
@@ -776,15 +784,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
+ if (ctnetlink_dump_timeout(skb, ct, true) < 0)
+ goto nla_put_failure;
+
if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
- ctnetlink_dump_timestamp(skb, ct) < 0)
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, true) < 0)
goto nla_put_failure;
} else {
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
goto nla_put_failure;
- if (events & (1 << IPCT_PROTOINFO)
- && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (events & (1 << IPCT_PROTOINFO) &&
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
@@ -846,21 +858,112 @@ static int ctnetlink_done(struct netlink_callback *cb)
return 0;
}
+struct ctnetlink_filter_u32 { /* value/mask pair; an entry passes when (field & mask) == val */
+ u32 val; /* expected value of the masked field, host byte order */
+ u32 mask; /* bits of the field that take part in the comparison */
+};
+
struct ctnetlink_filter {
u8 family;
- struct {
- u_int32_t val;
- u_int32_t mask;
- } mark;
+
+ u_int32_t orig_flags;
+ u_int32_t reply_flags;
+
+ struct nf_conntrack_tuple orig;
+ struct nf_conntrack_tuple reply;
+ struct nf_conntrack_zone zone;
+
+ struct ctnetlink_filter_u32 mark;
+ struct ctnetlink_filter_u32 status;
+};
+
+static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = {
+ [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 },
+ [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 },
};
+/*
+ * Unpack the nested CTA_FILTER attribute into @filter.
+ *
+ * CTA_FILTER_ORIG_FLAGS and CTA_FILTER_REPLY_FLAGS are both optional; each
+ * one that is present is stored in @filter and must not carry bits outside
+ * CTA_FILTER_F_ALL.
+ *
+ * Returns 0 on success, the nla_parse_nested() error if the nest is
+ * malformed, or -EOPNOTSUPP when an unknown flag bit is set.
+ */
+static int ctnetlink_parse_filter(const struct nlattr *attr,
+				  struct ctnetlink_filter *filter)
+{
+	struct nlattr *tb[CTA_FILTER_MAX + 1];
+	const struct nlattr *nla;
+	int err;
+
+	err = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy,
+			       NULL);
+	if (err != 0)
+		return err;
+
+	nla = tb[CTA_FILTER_ORIG_FLAGS];
+	if (nla) {
+		/* stored first, validated second - same order as before */
+		filter->orig_flags = nla_get_u32(nla);
+		if ((filter->orig_flags & ~CTA_FILTER_F_ALL) != 0)
+			return -EOPNOTSUPP;
+	}
+
+	nla = tb[CTA_FILTER_REPLY_FLAGS];
+	if (nla) {
+		filter->reply_flags = nla_get_u32(nla);
+		if ((filter->reply_flags & ~CTA_FILTER_F_ALL) != 0)
+			return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int ctnetlink_parse_zone(const struct nlattr *attr,
+ struct nf_conntrack_zone *zone);
+static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple,
+ u32 type, u_int8_t l3num,
+ struct nf_conntrack_zone *zone,
+ u_int32_t flags);
+
+/*
+ * Fill @mark from CTA_MARK / CTA_MARK_MASK.
+ *
+ * With CONFIG_NF_CONNTRACK_MARK disabled this does nothing and returns 0.
+ * Otherwise:
+ *  - neither attribute present: @mark is left untouched, returns 0;
+ *  - only CTA_MARK_MASK present: returns -EINVAL;
+ *  - CTA_MARK present: the value is stored and the mask comes from
+ *    CTA_MARK_MASK, or is 0xffffffff when that attribute is absent.
+ */
+static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark,
+				       const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	const struct nlattr *val_attr = cda[CTA_MARK];
+	const struct nlattr *mask_attr = cda[CTA_MARK_MASK];
+
+	/* a mask without a value to compare against makes no sense */
+	if (!val_attr)
+		return mask_attr ? -EINVAL : 0;
+
+	mark->val = ntohl(nla_get_be32(val_attr));
+	mark->mask = mask_attr ? ntohl(nla_get_be32(mask_attr)) : 0xffffffff;
+#endif
+	return 0;
+}
+
+/*
+ * Fill @status from CTA_STATUS / CTA_STATUS_MASK.
+ *
+ *  - neither attribute present: @status is left untouched, returns 0;
+ *  - only CTA_STATUS_MASK present: returns -EINVAL;
+ *  - CTA_STATUS present: the value is stored and the mask comes from
+ *    CTA_STATUS_MASK, or defaults to the value itself; a resulting mask of
+ *    zero is rejected with -EINVAL.
+ */
+static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status,
+					 const struct nlattr * const cda[])
+{
+	const struct nlattr *val_attr = cda[CTA_STATUS];
+	const struct nlattr *mask_attr = cda[CTA_STATUS_MASK];
+
+	/* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */
+	BUILD_BUG_ON(__IPS_MAX_BIT >= 32);
+
+	if (!val_attr)
+		return mask_attr ? -EINVAL : 0;
+
+	status->val = ntohl(nla_get_be32(val_attr));
+	status->mask = mask_attr ? ntohl(nla_get_be32(mask_attr)) : status->val;
+
+	/* status->val == 0? always true, else always false. */
+	return status->mask ? 0 : -EINVAL;
+}
+
static struct ctnetlink_filter *
ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
{
struct ctnetlink_filter *filter;
+ int err;
#ifndef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK])
+ if (cda[CTA_MARK] || cda[CTA_MARK_MASK])
return ERR_PTR(-EOPNOTSUPP);
#endif
@@ -870,13 +973,66 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
filter->family = family;
-#ifdef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
- filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
- filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ err = ctnetlink_filter_parse_mark(&filter->mark, cda);
+ if (err)
+ goto err_filter;
+
+ err = ctnetlink_filter_parse_status(&filter->status, cda);
+ if (err)
+ goto err_filter;
+
+ if (!cda[CTA_FILTER])
+ return filter;
+
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
+ if (err < 0)
+ goto err_filter;
+
+ err = ctnetlink_parse_filter(cda[CTA_FILTER], filter);
+ if (err < 0)
+ goto err_filter;
+
+ if (filter->orig_flags) {
+ if (!cda[CTA_TUPLE_ORIG]) {
+ err = -EINVAL;
+ goto err_filter;
+ }
+
+ err = ctnetlink_parse_tuple_filter(cda, &filter->orig,
+ CTA_TUPLE_ORIG,
+ filter->family,
+ &filter->zone,
+ filter->orig_flags);
+ if (err < 0)
+ goto err_filter;
}
-#endif
+
+ if (filter->reply_flags) {
+ if (!cda[CTA_TUPLE_REPLY]) {
+ err = -EINVAL;
+ goto err_filter;
+ }
+
+ err = ctnetlink_parse_tuple_filter(cda, &filter->reply,
+ CTA_TUPLE_REPLY,
+ filter->family,
+ &filter->zone,
+ filter->reply_flags);
+ if (err < 0)
+ goto err_filter;
+ }
+
return filter;
+
+err_filter:
+ kfree(filter);
+
+ return ERR_PTR(err);
+}
+
+/*
+ * Tell whether a request needs a ctnetlink_filter: true when @family is
+ * non-zero or when any of CTA_MARK, CTA_STATUS or CTA_FILTER is present
+ * in @cda.
+ */
+static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
+{
+	if (family)
+		return true;
+
+	if (cda[CTA_MARK] || cda[CTA_STATUS])
+		return true;
+
+	return cda[CTA_FILTER] != NULL;
+}
static int ctnetlink_start(struct netlink_callback *cb)
@@ -886,7 +1042,7 @@ static int ctnetlink_start(struct netlink_callback *cb)
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u8 family = nfmsg->nfgen_family;
- if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
+ if (ctnetlink_needs_filter(family, cda)) {
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
@@ -896,9 +1052,80 @@ static int ctnetlink_start(struct netlink_callback *cb)
return 0;
}
+/*
+ * Compare the fields of @ct_tuple selected by @flags against @filter_tuple.
+ *
+ * @filter_tuple: tuple holding the values to match
+ * @ct_tuple:     tuple of the conntrack entry being examined
+ * @flags:        CTA_FILTER_FLAG() bits naming the fields to compare; a
+ *                field whose bit is clear is ignored
+ * @family:       NFPROTO_IPV4 or NFPROTO_IPV6 selects the address
+ *                comparison, any other value skips it
+ *
+ * The port / ICMP comparisons are chosen by the protocol number of
+ * @ct_tuple, not of @filter_tuple.
+ *
+ * Returns 1 when every selected field is equal, 0 on the first mismatch.
+ */
+static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple,
+					struct nf_conntrack_tuple *ct_tuple,
+					u_int32_t flags, int family)
+{
+	switch (family) {
+	case NFPROTO_IPV4:
+		if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) &&
+		    filter_tuple->src.u3.ip != ct_tuple->src.u3.ip)
+			return 0;
+
+		if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) &&
+		    filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip)
+			return 0;
+		break;
+	case NFPROTO_IPV6:
+		/*
+		 * ipv6_addr_cmp() is memcmp()-like: it returns non-zero when
+		 * the addresses differ.  A mismatch must reject the entry,
+		 * exactly like the "!=" tests of the IPv4 branch, so the
+		 * result is tested without negation.
+		 */
+		if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) &&
+		    ipv6_addr_cmp(&filter_tuple->src.u3.in6,
+				  &ct_tuple->src.u3.in6))
+			return 0;
+
+		if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) &&
+		    ipv6_addr_cmp(&filter_tuple->dst.u3.in6,
+				  &ct_tuple->dst.u3.in6))
+			return 0;
+		break;
+	}
+
+	if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) &&
+	    filter_tuple->dst.protonum != ct_tuple->dst.protonum)
+		return 0;
+
+	switch (ct_tuple->dst.protonum) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		/* the tcp member is read for both protocols */
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) &&
+		    filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port)
+			return 0;
+
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) &&
+		    filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port)
+			return 0;
+		break;
+	case IPPROTO_ICMP:
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) &&
+		    filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type)
+			return 0;
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) &&
+		    filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
+			return 0;
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) &&
+		    filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id)
+			return 0;
+		break;
+	case IPPROTO_ICMPV6:
+		/* ICMPv6 reads the same icmp members, under its own flags */
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) &&
+		    filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type)
+			return 0;
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) &&
+		    filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
+			return 0;
+		if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) &&
+		    filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id)
+			return 0;
+		break;
+	}
+
+	return 1;
+}
+
static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
{
struct ctnetlink_filter *filter = data;
+ struct nf_conntrack_tuple *tuple;
+ u32 status;
if (filter == NULL)
goto out;
@@ -910,10 +1137,29 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
if (filter->family && nf_ct_l3num(ct) != filter->family)
goto ignore_entry;
+ if (filter->orig_flags) {
+ tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
+ if (!ctnetlink_filter_match_tuple(&filter->orig, tuple,
+ filter->orig_flags,
+ filter->family))
+ goto ignore_entry;
+ }
+
+ if (filter->reply_flags) {
+ tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY);
+ if (!ctnetlink_filter_match_tuple(&filter->reply, tuple,
+ filter->reply_flags,
+ filter->family))
+ goto ignore_entry;
+ }
+
#ifdef CONFIG_NF_CONNTRACK_MARK
if ((ct->mark & filter->mark.mask) != filter->mark.val)
goto ignore_entry;
#endif
+ status = (u32)READ_ONCE(ct->status);
+ if ((status & filter->status.mask) != filter->status.val)
+ goto ignore_entry;
out:
return 1;
@@ -925,6 +1171,7 @@ ignore_entry:
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
struct net *net = sock_net(skb->sk);
struct nf_conn *ct, *last;
struct nf_conntrack_tuple_hash *h;
@@ -954,12 +1201,11 @@ restart:
}
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_is_expired(ct)) {
+ /* need to defer nf_ct_kill() until lock is released */
if (i < ARRAY_SIZE(nf_ct_evict) &&
- atomic_inc_not_zero(&ct->ct_general.use))
+ refcount_inc_not_zero(&ct->ct_general.use))
nf_ct_evict[i++] = ct;
continue;
}
@@ -967,6 +1213,9 @@ restart:
if (!net_eq(net, nf_ct_net(ct)))
continue;
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+
if (cb->args[1]) {
if (ct != last)
continue;
@@ -979,7 +1228,7 @@ restart:
ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct, true);
+ ct, true, flags);
if (res < 0) {
nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
@@ -1014,31 +1263,50 @@ out:
}
static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
+ struct nf_conntrack_tuple *t,
+ u_int32_t flags)
{
- if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
+ if (!tb[CTA_IP_V4_SRC])
+ return -EINVAL;
+
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ }
- t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+ if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
+ if (!tb[CTA_IP_V4_DST])
+ return -EINVAL;
+
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+ }
return 0;
}
static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
+ struct nf_conntrack_tuple *t,
+ u_int32_t flags)
{
- if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
+ if (!tb[CTA_IP_V6_SRC])
+ return -EINVAL;
+
+ t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
+ }
- t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
- t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+ if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
+ if (!tb[CTA_IP_V6_DST])
+ return -EINVAL;
+
+ t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+ }
return 0;
}
static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
struct nlattr *tb[CTA_IP_MAX+1];
int ret = 0;
@@ -1054,10 +1322,10 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
switch (tuple->src.l3num) {
case NFPROTO_IPV4:
- ret = ipv4_nlattr_to_tuple(tb, tuple);
+ ret = ipv4_nlattr_to_tuple(tb, tuple, flags);
break;
case NFPROTO_IPV6:
- ret = ipv6_nlattr_to_tuple(tb, tuple);
+ ret = ipv6_nlattr_to_tuple(tb, tuple, flags);
break;
}
@@ -1069,7 +1337,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
};
static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
const struct nf_conntrack_l4proto *l4proto;
struct nlattr *tb[CTA_PROTO_MAX+1];
@@ -1080,8 +1349,12 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
if (ret < 0)
return ret;
+ if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)))
+ return 0;
+
if (!tb[CTA_PROTO_NUM])
return -EINVAL;
+
tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
rcu_read_lock();
@@ -1092,7 +1365,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
l4proto->nla_policy,
NULL);
if (ret == 0)
- ret = l4proto->nlattr_to_tuple(tb, tuple);
+ ret = l4proto->nlattr_to_tuple(tb, tuple, flags);
}
rcu_read_unlock();
@@ -1143,10 +1416,21 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
[CTA_TUPLE_ZONE] = { .type = NLA_U16 },
};
+#define CTA_FILTER_F_ALL_CTA_PROTO /* OR of the port and ICMP/ICMPv6 type, code and id filter flags */ \
+ (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \
+ CTA_FILTER_F_CTA_PROTO_DST_PORT | \
+ CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \
+ CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \
+ CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
+ CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
+ CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
+ CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)
+
static int
-ctnetlink_parse_tuple(const struct nlattr * const cda[],
- struct nf_conntrack_tuple *tuple, u32 type,
- u_int8_t l3num, struct nf_conntrack_zone *zone)
+ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple, u32 type,
+ u_int8_t l3num, struct nf_conntrack_zone *zone,
+ u_int32_t flags)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -1158,23 +1442,33 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
if (err < 0)
return err;
- if (!tb[CTA_TUPLE_IP])
- return -EINVAL;
-
+ if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6)
+ return -EOPNOTSUPP;
tuple->src.l3num = l3num;
- err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
- if (err < 0)
- return err;
+ if (flags & CTA_FILTER_FLAG(CTA_IP_DST) ||
+ flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
+ if (!tb[CTA_TUPLE_IP])
+ return -EINVAL;
- if (!tb[CTA_TUPLE_PROTO])
- return -EINVAL;
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags);
+ if (err < 0)
+ return err;
+ }
- err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple);
- if (err < 0)
- return err;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) {
+ if (!tb[CTA_TUPLE_PROTO])
+ return -EINVAL;
- if (tb[CTA_TUPLE_ZONE]) {
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags);
+ if (err < 0)
+ return err;
+ } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) {
+ /* Can't manage proto flags without a protonum */
+ return -EINVAL;
+ }
+
+ if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) {
if (!zone)
return -EINVAL;
@@ -1193,6 +1487,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
return 0;
}
+/*
+ * Unfiltered front end of ctnetlink_parse_tuple_filter(): forwards all
+ * arguments unchanged and passes CTA_FILTER_FLAG(ALL) as the flag set.
+ * Returns whatever ctnetlink_parse_tuple_filter() returns.
+ */
+static int
+ctnetlink_parse_tuple(const struct nlattr * const cda[],
+		      struct nf_conntrack_tuple *tuple, u32 type,
+		      u_int8_t l3num, struct nf_conntrack_zone *zone)
+{
+	const u_int32_t every_field = CTA_FILTER_FLAG(ALL);
+
+	return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone,
+					    every_field);
+}
+
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
[CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN - 1 },
@@ -1240,6 +1543,8 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
.len = NF_CT_LABELS_MAX_SIZE },
[CTA_LABELS_MASK] = { .type = NLA_BINARY,
.len = NF_CT_LABELS_MAX_SIZE },
+ [CTA_FILTER] = { .type = NLA_NESTED },
+ [CTA_STATUS_MASK] = { .type = NLA_U32 },
};
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
@@ -1255,31 +1560,38 @@ static int ctnetlink_flush_conntrack(struct net *net,
u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
+ struct nf_ct_iter_data iter = {
+ .net = net,
+ .portid = portid,
+ .report = report,
+ };
+
+ if (ctnetlink_needs_filter(family, cda)) {
+ if (cda[CTA_FILTER])
+ return -EOPNOTSUPP;
- if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
+
+ iter.data = filter;
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
- portid, report);
+ nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
kfree(filter);
return 0;
}
-static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_del_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u8 family = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1288,22 +1600,22 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
if (cda[CTA_TUPLE_ORIG])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
- nfmsg->nfgen_family, &zone);
+ family, &zone);
else if (cda[CTA_TUPLE_REPLY])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
- nfmsg->nfgen_family, &zone);
+ family, &zone);
else {
- u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC;
+ u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC;
- return ctnetlink_flush_conntrack(net, cda,
+ return ctnetlink_flush_conntrack(info->net, cda,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh), u3);
+ nlmsg_report(info->nlh), u3);
}
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, &zone, &tuple);
+ h = nf_conntrack_find_get(info->net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -1323,28 +1635,25 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
}
}
- nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh));
nf_ct_put(ct);
return 0;
}
-static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
- struct sk_buff *skb2 = NULL;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct sk_buff *skb2;
+ struct nf_conn *ct;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = ctnetlink_start,
.dump = ctnetlink_dump_table,
@@ -1352,7 +1661,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
.data = (void *)cda,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1371,158 +1680,163 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, &zone, &tuple);
+ h = nf_conntrack_find_get(info->net, &zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
+ if (!skb2) {
nf_ct_put(ct);
return -ENOMEM;
}
- err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true);
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct,
+ true, 0);
nf_ct_put(ct);
- if (err <= 0)
- goto free;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+}
+
+static int ctnetlink_done_list(struct netlink_callback *cb)
+{
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+
+ if (ctx->last)
+ nf_ct_put(ctx->last);
return 0;
+}
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int ctnetlink_dump_one_entry(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nf_conn *ct,
+ bool dying)
+{
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u8 l3proto = nfmsg->nfgen_family;
+ int res;
+
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ return 0;
+
+ if (ctx->last) {
+ if (ct != ctx->last)
+ return 0;
+
+ ctx->last = NULL;
+ }
+
+ /* We can't dump extension info for the unconfirmed
+ * list because unconfirmed conntracks can have
+ * ct->ext reallocated (and thus freed).
+ *
+ * In the dying list case ct->ext can't be free'd
+ * until after we drop pcpu->lock.
+ */
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, dying, 0);
+ if (res < 0) {
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return 0;
+
+ ctx->last = ct;
+ }
+
+ return res;
}
+#endif
-static int ctnetlink_done_list(struct netlink_callback *cb)
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
return 0;
}
static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conn *ct, *last;
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+ struct nf_conn *last = ctx->last;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ const struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_net_ecache *ecache_net;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- u_int8_t l3proto = nfmsg->nfgen_family;
- int res;
- int cpu;
- struct hlist_nulls_head *list;
- struct net *net = sock_net(skb->sk);
+#endif
- if (cb->args[2])
+ if (ctx->done)
return 0;
- last = (struct nf_conn *)cb->args[1];
+ ctx->last = NULL;
- for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
- struct ct_pcpu *pcpu;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ ecache_net = nf_conn_pernet_ecache(net);
+ spin_lock_bh(&ecache_net->dying_lock);
- if (!cpu_possible(cpu))
- continue;
+ hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
+ struct nf_conn *ct;
+ int res;
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- spin_lock_bh(&pcpu->lock);
- list = dying ? &pcpu->dying : &pcpu->unconfirmed;
-restart:
- hlist_nulls_for_each_entry(h, n, list, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (l3proto && nf_ct_l3num(ct) != l3proto)
- continue;
- if (cb->args[1]) {
- if (ct != last)
- continue;
- cb->args[1] = 0;
- }
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (last && last != ct)
+ continue;
- /* We can't dump extension info for the unconfirmed
- * list because unconfirmed conntracks can have
- * ct->ext reallocated (and thus freed).
- *
- * In the dying list case ct->ext can't be free'd
- * until after we drop pcpu->lock.
- */
- res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct, dying ? true : false);
- if (res < 0) {
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- continue;
- cb->args[0] = cpu;
- cb->args[1] = (unsigned long)ct;
- spin_unlock_bh(&pcpu->lock);
- goto out;
- }
+ res = ctnetlink_dump_one_entry(skb, cb, ct, true);
+ if (res < 0) {
+ spin_unlock_bh(&ecache_net->dying_lock);
+ nf_ct_put(last);
+ return skb->len;
}
- if (cb->args[1]) {
- cb->args[1] = 0;
- goto restart;
- }
- spin_unlock_bh(&pcpu->lock);
- }
- cb->args[2] = 1;
-out:
- if (last)
+
nf_ct_put(last);
+ last = NULL;
+ }
- return skb->len;
-}
+ spin_unlock_bh(&ecache_net->dying_lock);
+#endif
+ ctx->done = true;
+ nf_ct_put(last);
-static int
-ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, true);
+ return skb->len;
}
-static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_dying(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
.done = ctnetlink_done_list,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return -EOPNOTSUPP;
}
-static int
-ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- return ctnetlink_dump_list(skb, cb, false);
-}
-
-static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
-{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
.done = ctnetlink_done_list,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return -EOPNOTSUPP;
@@ -1533,8 +1847,9 @@ static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
+ __must_hold(RCU)
{
- struct nf_nat_hook *nat_hook;
+ const struct nf_nat_hook *nat_hook;
int err;
nat_hook = rcu_dereference(nf_nat_hook);
@@ -1576,45 +1891,10 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static void
-__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
- unsigned long off)
-{
- unsigned int bit;
-
- /* Ignore these unchangable bits */
- on &= ~IPS_UNCHANGEABLE_MASK;
- off &= ~IPS_UNCHANGEABLE_MASK;
-
- for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
- if (on & (1 << bit))
- set_bit(bit, &ct->status);
- else if (off & (1 << bit))
- clear_bit(bit, &ct->status);
- }
-}
-
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
- unsigned long d;
- unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
- d = ct->status ^ status;
-
- if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
- /* unchangeable */
- return -EBUSY;
-
- if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
- /* SEEN_REPLY bit can only be set */
- return -EBUSY;
-
- if (d & IPS_ASSURED && !(status & IPS_ASSURED))
- /* ASSURED bit can only be set */
- return -EBUSY;
-
- __ctnetlink_change_status(ct, status, 0);
- return 0;
+ return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
}
static int
@@ -1690,7 +1970,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
}
if (help) {
- if (help->helper == helper) {
+ if (rcu_access_pointer(help->helper) == helper) {
/* update private helper data if allowed. */
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
@@ -1709,16 +1989,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
static int ctnetlink_change_timeout(struct nf_conn *ct,
const struct nlattr * const cda[])
{
- u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = nfct_time_stamp + (u32)timeout;
-
- if (test_bit(IPS_DYING_BIT, &ct->status))
- return -ETIME;
-
- return 0;
+ return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
@@ -1978,9 +2249,7 @@ ctnetlink_create_conntrack(struct net *net,
goto err1;
timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = (u32)timeout + nfct_time_stamp;
+ __nf_ct_set_timeout(ct, timeout);
rcu_read_lock();
if (cda[CTA_HELP]) {
@@ -2025,14 +2294,10 @@ ctnetlink_create_conntrack(struct net *net,
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
- /* not in hash table yet so not strictly necessary */
+ /* disable helper auto-assignment for this entry */
+ ct->status |= IPS_HELPER;
RCU_INIT_POINTER(help->helper, helper);
}
- } else {
- /* try an implicit helper assignation */
- err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- if (err < 0)
- goto err2;
}
err = ctnetlink_setup_nat(ct, cda);
@@ -2118,18 +2383,15 @@ err1:
return ERR_PTR(err);
}
-static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_new_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- struct nf_conn *ct;
- u_int8_t u3 = nfmsg->nfgen_family;
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -2151,13 +2413,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
}
if (cda[CTA_TUPLE_ORIG])
- h = nf_conntrack_find_get(net, &zone, &otuple);
+ h = nf_conntrack_find_get(info->net, &zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
- h = nf_conntrack_find_get(net, &zone, &rtuple);
+ h = nf_conntrack_find_get(info->net, &zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
enum ip_conntrack_events events;
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
@@ -2165,8 +2427,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
if (otuple.dst.protonum != rtuple.dst.protonum)
return -EINVAL;
- ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
- &rtuple, u3);
+ ct = ctnetlink_create_conntrack(info->net, &zone, cda,
+ &otuple, &rtuple, u3);
if (IS_ERR(ct))
return PTR_ERR(ct);
@@ -2189,7 +2451,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
(1 << IPCT_SYNPROXY) |
events,
ct, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
nf_ct_put(ct);
}
@@ -2199,7 +2461,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
err = -EEXIST;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+ if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) {
err = ctnetlink_change_conntrack(ct, cda);
if (err == 0) {
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
@@ -2211,7 +2473,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
(1 << IPCT_MARK) |
(1 << IPCT_SYNPROXY),
ct, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
}
@@ -2224,23 +2486,17 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
__u16 cpu, const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
IPCTNL_MSG_CT_GET_STATS_CPU);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, htons(cpu));
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(cpu);
-
if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
- nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
@@ -2248,7 +2504,11 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
- htonl(st->search_restart)))
+ htonl(st->search_restart)) ||
+ nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
+ htonl(st->clash_resolve)) ||
+ nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG,
+ htonl(st->chaintoolong)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -2287,17 +2547,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct_cpu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_ct_stat_cpu_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return 0;
@@ -2307,21 +2565,17 @@ static int
ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct net *net)
{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ unsigned int nr_conntracks;
+ struct nlmsghdr *nlh;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
+ nr_conntracks = nf_conntrack_count(net);
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
@@ -2337,10 +2591,8 @@ nlmsg_failure:
return -1;
}
-static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
struct sk_buff *skb2;
int err;
@@ -2350,23 +2602,15 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
return -ENOMEM;
err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
sock_net(skb->sk));
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
@@ -2405,6 +2649,8 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
+ + ctnetlink_acct_size(ct)
+ + ctnetlink_timestamp_size(ct)
#if IS_ENABLED(CONFIG_NF_NAT)
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
@@ -2419,12 +2665,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
;
}
-static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo)
-{
- return nf_ct_get(skb, ctinfo);
-}
-
static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
const struct nf_conntrack_zone *zone;
@@ -2462,10 +2702,14 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (ctnetlink_dump_protoinfo(skb, ct, false) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_helpinfo(skb, ct) < 0)
@@ -2542,7 +2786,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
* unchangeable bits but do not error out. Also user programs
* are allowed to clear the bits that they are allowed to change.
*/
- __ctnetlink_change_status(ct, status, ~status);
+ __nf_ct_change_status(ct, status, ~status);
return 0;
}
@@ -2657,8 +2901,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
}
-static struct nfnl_ct_hook ctnetlink_glue_hook = {
- .get_ct = ctnetlink_glue_get_ct,
+static const struct nfnl_ct_hook ctnetlink_glue_hook = {
.build_size = ctnetlink_glue_build_size,
.build = ctnetlink_glue_build,
.parse = ctnetlink_glue_parse,
@@ -2702,6 +2945,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
memset(&m, 0xFF, sizeof(m));
memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
m.src.u.all = mask->src.u.all;
+ m.src.l3num = tuple->src.l3num;
m.dst.protonum = tuple->dst.protonum;
nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK);
@@ -2731,7 +2975,7 @@ static const union nf_inet_addr any_addr;
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
- static __read_mostly siphash_key_t exp_id_seed;
+ static siphash_aligned_key_t exp_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
@@ -2824,19 +3068,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
int event, const struct nf_conntrack_expect *exp)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags,
+ exp->tuple.src.l3num, NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = exp->tuple.src.l3num;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
@@ -2851,12 +3090,11 @@ nla_put_failure:
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
-ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
+ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
{
struct nf_conntrack_expect *exp = item->exp;
struct net *net = nf_ct_exp_net(exp);
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct sk_buff *skb;
unsigned int type, group;
int flags = 0;
@@ -2879,15 +3117,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
goto errout;
type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type);
- nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, item->portid, 0, type, flags,
+ exp->tuple.src.l3num, NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = exp->tuple.src.l3num;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
@@ -3052,29 +3286,28 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
return err;
}
-static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct sk_buff *skb2;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct sk_buff *skb2;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
if (cda[CTA_EXPECT_MASTER])
- return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda,
- extack);
+ return ctnetlink_dump_exp_ct(info->net, info->sk, skb,
+ info->nlh, cda,
+ info->extack);
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
.done = ctnetlink_exp_done,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
}
@@ -3094,7 +3327,7 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
if (err < 0)
return err;
- exp = nf_ct_expect_find_get(net, &zone, &tuple);
+ exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -3107,42 +3340,39 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
}
}
- err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
+ if (!skb2) {
nf_ct_expect_put(exp);
- goto out;
+ return -ENOMEM;
}
rcu_read_lock();
err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+ info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+ exp);
rcu_read_unlock();
nf_ct_expect_put(exp);
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data)
{
+ struct nf_conntrack_helper *helper;
const struct nf_conn_help *m_help;
const char *name = data;
m_help = nfct_help(exp->master);
- return strcmp(m_help->helper->name, name) == 0;
+ helper = rcu_dereference(m_help->helper);
+ if (!helper)
+ return false;
+
+ return strcmp(helper->name, name) == 0;
}
static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
@@ -3150,15 +3380,13 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
return true;
}
-static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_del_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
int err;
@@ -3174,7 +3402,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
return err;
/* bump usage count to 2 */
- exp = nf_ct_expect_find_get(net, &zone, &tuple);
+ exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -3190,7 +3418,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
nf_ct_expect_put(exp);
}
spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3200,14 +3428,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
} else if (cda[CTA_EXPECT_HELP_NAME]) {
char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
- nf_ct_expect_iterate_net(net, expect_iter_name, name,
+ nf_ct_expect_iterate_net(info->net, expect_iter_name, name,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
} else {
/* This basically means we have to flush everything*/
- nf_ct_expect_iterate_net(net, expect_iter_all, NULL,
+ nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
return 0;
@@ -3403,15 +3631,13 @@ err_ct:
return err;
}
-static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_new_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
int err;
@@ -3430,20 +3656,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
return err;
spin_lock_bh(&nf_conntrack_expect_lock);
- exp = __nf_ct_expect_find(net, &zone, &tuple);
+ exp = __nf_ct_expect_find(info->net, &zone, &tuple);
if (!exp) {
spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE) {
- err = ctnetlink_create_expect(net, &zone, cda, u3,
+ if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
+ err = ctnetlink_create_expect(info->net, &zone, cda, u3,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
return err;
}
err = -EEXIST;
- if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ if (!(info->nlh->nlmsg_flags & NLM_F_EXCL))
err = ctnetlink_change_expect(exp, cda);
spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3455,20 +3681,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
IPCTNL_MSG_EXP_GET_STATS_CPU);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, htons(cpu));
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(cpu);
-
if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
@@ -3509,17 +3730,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_stat_cpu_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return 0;
@@ -3527,44 +3746,77 @@ static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static struct nf_ct_event_notifier ctnl_notifier = {
- .fcn = ctnetlink_conntrack_event,
-};
-
-static struct nf_exp_event_notifier ctnl_notifier_exp = {
- .fcn = ctnetlink_expect_event,
+ .ct_event = ctnetlink_conntrack_event,
+ .exp_event = ctnetlink_expect_event,
};
#endif
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
- [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
- [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
- [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
- [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
+ [IPCTNL_MSG_CT_NEW] = {
+ .call = ctnetlink_new_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET] = {
+ .call = ctnetlink_get_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_DELETE] = {
+ .call = ctnetlink_del_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = {
+ .call = ctnetlink_get_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET_STATS_CPU] = {
+ .call = ctnetlink_stat_ct_cpu,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_STATS] = {
+ .call = ctnetlink_stat_ct,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_DYING] = {
+ .call = ctnetlink_get_ct_dying,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_UNCONFIRMED] = {
+ .call = ctnetlink_get_ct_unconfirmed,
+ .type = NFNL_CB_MUTEX,
+ },
};
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
- [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
+ [IPCTNL_MSG_EXP_GET] = {
+ .call = ctnetlink_get_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_NEW] = {
+ .call = ctnetlink_new_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_DELETE] = {
+ .call = ctnetlink_del_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_GET_STATS_CPU] = {
+ .call = ctnetlink_stat_exp_cpu,
+ .type = NFNL_CB_MUTEX,
+ },
};
static const struct nfnetlink_subsystem ctnl_subsys = {
@@ -3588,58 +3840,29 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
static int __net_init ctnetlink_net_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- int ret;
-
- ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot register notifier.\n");
- goto err_out;
- }
-
- ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot expect register notifier.\n");
- goto err_unreg_notifier;
- }
+ nf_conntrack_register_notifier(net, &ctnl_notifier);
#endif
return 0;
-
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
- nf_conntrack_unregister_notifier(net, &ctnl_notifier);
-err_out:
- return ret;
-#endif
}
-static void ctnetlink_net_exit(struct net *net)
+static void ctnetlink_net_pre_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
- nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+ nf_conntrack_unregister_notifier(net);
#endif
}
-static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
-{
- struct net *net;
-
- list_for_each_entry(net, net_exit_list, exit_list)
- ctnetlink_net_exit(net);
-
- /* wait for other cpus until they are done with ctnl_notifiers */
- synchronize_rcu();
-}
-
static struct pernet_operations ctnetlink_net_ops = {
.init = ctnetlink_net_init,
- .exit_batch = ctnetlink_net_exit_batch,
+ .pre_exit = ctnetlink_net_pre_exit,
};
static int __init ctnetlink_init(void)
{
int ret;
+ BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx));
+
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index a971183f11af..4c679638df06 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Connection tracking support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
+ * PPTP is a protocol for creating virtual private networks.
* It is a specification defined by Microsoft and some vendors
* working with Microsoft. PPTP is built on top of a modified
* version of the Internet Generic Routing Encapsulation Protocol.
@@ -45,51 +45,37 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
static DEFINE_SPINLOCK(nf_pptp_lock);
-int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
-
-int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
-
-void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
- struct nf_conntrack_expect *expect_reply)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
-
-void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
- struct nf_conntrack_expect *exp) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
/* PptpControlMessageType names */
-const char *const pptp_msg_name[] = {
- "UNKNOWN_MESSAGE",
- "START_SESSION_REQUEST",
- "START_SESSION_REPLY",
- "STOP_SESSION_REQUEST",
- "STOP_SESSION_REPLY",
- "ECHO_REQUEST",
- "ECHO_REPLY",
- "OUT_CALL_REQUEST",
- "OUT_CALL_REPLY",
- "IN_CALL_REQUEST",
- "IN_CALL_REPLY",
- "IN_CALL_CONNECT",
- "CALL_CLEAR_REQUEST",
- "CALL_DISCONNECT_NOTIFY",
- "WAN_ERROR_NOTIFY",
- "SET_LINK_INFO"
+static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = {
+ [0] = "UNKNOWN_MESSAGE",
+ [PPTP_START_SESSION_REQUEST] = "START_SESSION_REQUEST",
+ [PPTP_START_SESSION_REPLY] = "START_SESSION_REPLY",
+ [PPTP_STOP_SESSION_REQUEST] = "STOP_SESSION_REQUEST",
+ [PPTP_STOP_SESSION_REPLY] = "STOP_SESSION_REPLY",
+ [PPTP_ECHO_REQUEST] = "ECHO_REQUEST",
+ [PPTP_ECHO_REPLY] = "ECHO_REPLY",
+ [PPTP_OUT_CALL_REQUEST] = "OUT_CALL_REQUEST",
+ [PPTP_OUT_CALL_REPLY] = "OUT_CALL_REPLY",
+ [PPTP_IN_CALL_REQUEST] = "IN_CALL_REQUEST",
+ [PPTP_IN_CALL_REPLY] = "IN_CALL_REPLY",
+ [PPTP_IN_CALL_CONNECT] = "IN_CALL_CONNECT",
+ [PPTP_CALL_CLEAR_REQUEST] = "CALL_CLEAR_REQUEST",
+ [PPTP_CALL_DISCONNECT_NOTIFY] = "CALL_DISCONNECT_NOTIFY",
+ [PPTP_WAN_ERROR_NOTIFY] = "WAN_ERROR_NOTIFY",
+ [PPTP_SET_LINK_INFO] = "SET_LINK_INFO"
};
+
+const char *pptp_msg_name(u_int16_t msg)
+{
+ if (msg > PPTP_MSG_MAX)
+ return pptp_msg_name_array[0];
+
+ return pptp_msg_name_array[msg];
+}
EXPORT_SYMBOL(pptp_msg_name);
#endif
@@ -103,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name);
static void pptp_expectfn(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ const struct nf_nat_pptp_hook *hook;
struct net *net = nf_ct_net(ct);
- typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
pr_debug("increasing timeouts\n");
/* increase timeout of GRE data channel conntrack entry */
@@ -114,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct,
/* Can you see how rusty this code is, compared with the pre-2.6.11
* one? That's what happened to my shiny newnat of 2002 ;( -HW */
- nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
- if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
- nf_nat_pptp_expectfn(ct, exp);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->master->status & IPS_NAT_MASK)
+ hook->expectfn(ct, exp);
else {
struct nf_conntrack_tuple inv_t;
struct nf_conntrack_expect *exp_other;
@@ -201,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
{
struct nf_conntrack_expect *exp_orig, *exp_reply;
+ const struct nf_nat_pptp_hook *hook;
enum ip_conntrack_dir dir;
int ret = 1;
- typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
exp_orig = nf_ct_expect_alloc(ct);
if (exp_orig == NULL)
@@ -231,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
IPPROTO_GRE, &callid, &peer_callid);
exp_reply->expectfn = pptp_expectfn;
- nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
- if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
- nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ hook->exp_gre(exp_orig, exp_reply);
if (nf_ct_expect_related(exp_orig, 0) != 0)
goto out_put_both;
if (nf_ct_expect_related(exp_reply, 0) != 0)
@@ -271,12 +257,12 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
msg = ntohs(ctlh->messageType);
- pr_debug("inbound control message %s\n", pptp_msg_name[msg]);
+ pr_debug("inbound control message %s\n", pptp_msg_name(msg));
switch (msg) {
case PPTP_START_SESSION_REPLY:
@@ -311,7 +297,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
pcid = pptpReq->ocack.peersCallID;
if (info->pns_call_id != pcid)
goto invalid;
- pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+ pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name(msg),
ntohs(cid), ntohs(pcid));
if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
@@ -328,7 +314,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
cid = pptpReq->icreq.callID;
- pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));
info->cstate = PPTP_CALL_IN_REQ;
info->pac_call_id = cid;
break;
@@ -347,7 +333,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
if (info->pns_call_id != pcid)
goto invalid;
- pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
+ pr_debug("%s, PCID=%X\n", pptp_msg_name(msg), ntohs(pcid));
info->cstate = PPTP_CALL_IN_CONF;
/* we expect a GRE connection from PAC to PNS */
@@ -357,7 +343,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
case PPTP_CALL_DISCONNECT_NOTIFY:
/* server confirms disconnect */
cid = pptpReq->disc.callID;
- pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));
info->cstate = PPTP_CALL_NONE;
/* untrack this call id, unexpect GRE packets */
@@ -375,16 +361,15 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
- if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_inbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
pr_debug("invalid %s: type=%d cid=%u pcid=%u "
"cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+ pptp_msg_name(msg),
msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
ntohs(info->pns_call_id), ntohs(info->pac_call_id));
return NF_ACCEPT;
@@ -399,12 +384,12 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
msg = ntohs(ctlh->messageType);
- pr_debug("outbound control message %s\n", pptp_msg_name[msg]);
+ pr_debug("outbound control message %s\n", pptp_msg_name(msg));
switch (msg) {
case PPTP_START_SESSION_REQUEST:
@@ -426,7 +411,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
info->cstate = PPTP_CALL_OUT_REQ;
/* track PNS call id */
cid = pptpReq->ocreq.callID;
- pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));
info->pns_call_id = cid;
break;
@@ -440,7 +425,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
pcid = pptpReq->icack.peersCallID;
if (info->pac_call_id != pcid)
goto invalid;
- pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
+ pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name(msg),
ntohs(cid), ntohs(pcid));
if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
@@ -471,16 +456,15 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
- if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_outbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
pr_debug("invalid %s: type=%d cid=%u pcid=%u "
"cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+ pptp_msg_name(msg),
msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
ntohs(info->pns_call_id), ntohs(info->pac_call_id));
return NF_ACCEPT;
@@ -536,7 +520,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
nexthdr_off = protoff;
tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
- BUG_ON(!tcph);
+ if (!tcph)
+ return NF_ACCEPT;
+
nexthdr_off += tcph->doff * 4;
datalen = tcplen - tcph->doff * 4;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index a0560d175a7f..895b09cbd7cf 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -42,17 +42,16 @@
#include <net/ipv6.h>
#include <net/inet_frag.h>
-extern unsigned int nf_conntrack_net_id;
-
static DEFINE_MUTEX(nf_ct_proto_mutex);
#ifdef CONFIG_SYSCTL
-__printf(5, 6)
+__printf(4, 5)
void nf_l4proto_log_invalid(const struct sk_buff *skb,
- struct net *net,
- u16 pf, u8 protonum,
+ const struct nf_hook_state *state,
+ u8 protonum,
const char *fmt, ...)
{
+ struct net *net = state->net;
struct va_format vaf;
va_list args;
@@ -64,15 +63,16 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb,
vaf.fmt = fmt;
vaf.va = &args;
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_proto_%d: %pV ", protonum, &vaf);
+ nf_log_packet(net, state->pf, 0, skb, state->in, state->out,
+ NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf);
va_end(args);
}
EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid);
-__printf(3, 4)
+__printf(4, 5)
void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
const struct nf_conn *ct,
+ const struct nf_hook_state *state,
const char *fmt, ...)
{
struct va_format vaf;
@@ -87,7 +87,7 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
vaf.fmt = fmt;
vaf.va = &args;
- nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct),
+ nf_l4proto_log_invalid(skb, state,
nf_ct_protonum(ct), "%pV", &vaf);
va_end(args);
}
@@ -155,6 +155,16 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
}
EXPORT_SYMBOL_GPL(nf_confirm);
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (state->hook == NF_INET_POST_ROUTING &&
+ netif_is_l3_master(state->out))
+ return true;
+#endif
+ return false;
+}
+
static unsigned int ipv4_confirm(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -166,6 +176,9 @@ static unsigned int ipv4_confirm(void *priv,
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
return nf_conntrack_confirm(skb);
+ if (in_vrf_postrouting(state))
+ return NF_ACCEPT;
+
return nf_confirm(skb,
skb_network_offset(skb) + ip_hdrlen(skb),
ct, ctinfo);
@@ -374,6 +387,9 @@ static unsigned int ipv6_confirm(void *priv,
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
return nf_conntrack_confirm(skb);
+ if (in_vrf_postrouting(state))
+ return NF_ACCEPT;
+
protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
&frag_off);
if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
@@ -446,7 +462,7 @@ static struct nf_ct_bridge_info *nf_ct_bridge_info;
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
- struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
bool fixup_needed = false, retry = true;
int err = 0;
retry:
@@ -522,29 +538,37 @@ retry:
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
- if (fixup_needed)
- nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
- (void *)(unsigned long)nfproto, 0, 0);
+ if (fixup_needed) {
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = (void *)(unsigned long)nfproto,
+ };
+ nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
+ }
return err;
}
static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
{
- struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
mutex_lock(&nf_ct_proto_mutex);
switch (nfproto) {
case NFPROTO_IPV4:
- if (cnet->users4 && (--cnet->users4 == 0))
+ if (cnet->users4 && (--cnet->users4 == 0)) {
nf_unregister_net_hooks(net, ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
+ nf_defrag_ipv4_disable(net);
+ }
break;
#if IS_ENABLED(CONFIG_IPV6)
case NFPROTO_IPV6:
- if (cnet->users6 && (--cnet->users6 == 0))
+ if (cnet->users6 && (--cnet->users6 == 0)) {
nf_unregister_net_hooks(net, ipv6_conntrack_ops,
ARRAY_SIZE(ipv6_conntrack_ops));
+ nf_defrag_ipv6_disable(net);
+ }
break;
#endif
case NFPROTO_BRIDGE:
@@ -565,6 +589,7 @@ static int nf_ct_netns_inet_get(struct net *net)
int err;
err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+#if IS_ENABLED(CONFIG_IPV6)
if (err < 0)
goto err1;
err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
@@ -575,6 +600,7 @@ static int nf_ct_netns_inet_get(struct net *net)
err2:
nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
+#endif
return err;
}
@@ -610,7 +636,7 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto)
switch (nfproto) {
case NFPROTO_BRIDGE:
nf_ct_netns_do_put(net, NFPROTO_BRIDGE);
- /* fall through */
+ fallthrough;
case NFPROTO_INET:
nf_ct_netns_do_put(net, NFPROTO_IPV4);
nf_ct_netns_do_put(net, NFPROTO_IPV6);
@@ -658,7 +684,7 @@ int nf_conntrack_proto_init(void)
#if IS_ENABLED(CONFIG_IPV6)
cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst6);
+ nf_unregister_sockopt(&so_getorigdst);
#endif
return ret;
}
@@ -691,13 +717,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
#endif
}
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
-#ifdef CONFIG_NF_CT_PROTO_GRE
- nf_ct_gre_keymap_flush(net);
-#endif
-}
-
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b3f4a334f9d7..c1557d47ccd1 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -382,7 +382,8 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
static noinline bool
dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- const struct dccp_hdr *dh)
+ const struct dccp_hdr *dh,
+ const struct nf_hook_state *hook_state)
{
struct net *net = nf_ct_net(ct);
struct nf_dccp_net *dn;
@@ -397,6 +398,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
msg = "not picking up existing connection ";
goto out_invalid;
}
+ break;
case CT_DCCP_REQUEST:
break;
case CT_DCCP_INVALID:
@@ -413,7 +415,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
out_invalid:
- nf_ct_l4proto_log_invalid(skb, ct, "%s", msg);
+ nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg);
return false;
}
@@ -463,8 +465,7 @@ static bool dccp_error(const struct dccp_hdr *dh,
}
return false;
out_invalid:
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_DCCP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg);
return true;
}
@@ -487,7 +488,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
return -NF_ACCEPT;
type = dh->dccph_type;
- if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh))
+ if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state))
return -NF_ACCEPT;
if (type == DCCP_PKT_RESET &&
@@ -542,11 +543,11 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
ct->proto.dccp.last_pkt = type;
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet");
return NF_ACCEPT;
case CT_DCCP_INVALID:
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition");
return -NF_ACCEPT;
}
@@ -589,7 +590,7 @@ static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
@@ -597,15 +598,22 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP);
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
+ if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
cpu_to_be64(ct->proto.dccp.handshake_seq),
CTA_PROTOINFO_DCCP_PAD))
goto nla_put_failure;
+skip_state:
nla_nest_end(skb, nest_parms);
spin_unlock_bh(&ct->lock);
+
return 0;
nla_put_failure:
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 5b05487a60d2..728eeb0aea87 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
return &net->ct.nf_ct_proto.gre;
}
-void nf_ct_gre_keymap_flush(struct net *net)
-{
- struct nf_gre_net *net_gre = gre_pernet(net);
- struct nf_ct_gre_keymap *km, *tmp;
-
- spin_lock_bh(&keymap_lock);
- list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
- list_del_rcu(&km->list);
- kfree_rcu(km, rcu);
- }
- spin_unlock_bh(&keymap_lock);
-}
-
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
const struct nf_conntrack_tuple *t)
{
@@ -218,9 +205,6 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state)
{
- if (state->pf != NFPROTO_IPV4)
- return -NF_ACCEPT;
-
if (!nf_ct_is_confirmed(ct)) {
unsigned int *timeouts = nf_ct_timeout_lookup(ct);
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index c2e3dff773bc..b38b7164acd5 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -20,6 +20,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
+#include "nf_internals.h"
+
static const unsigned int nf_ct_icmp_timeout = 30*HZ;
bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -168,12 +170,12 @@ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
ct_daddr = &ct->tuplehash[dir].tuple.dst.u3;
if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) {
if (state->pf == AF_INET) {
- nf_l4proto_log_invalid(skb, state->net, state->pf,
+ nf_l4proto_log_invalid(skb, state,
l4proto,
"outer daddr %pI4 != inner %pI4",
&outer_daddr->ip, &ct_daddr->ip);
} else if (state->pf == AF_INET6) {
- nf_l4proto_log_invalid(skb, state->net, state->pf,
+ nf_l4proto_log_invalid(skb, state,
l4proto,
"outer daddr %pI6 != inner %pI6",
&outer_daddr->ip6, &ct_daddr->ip6);
@@ -195,8 +197,7 @@ static void icmp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_ICMP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg);
}
/* Small and modified version of icmp_rcv */
@@ -271,20 +272,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
};
static int icmp_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
- if (!tb[CTA_PROTO_ICMP_TYPE] ||
- !tb[CTA_PROTO_ICMP_CODE] ||
- !tb[CTA_PROTO_ICMP_ID])
- return -EINVAL;
-
- tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
- tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
- tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
-
- if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
- !invmap[tuple->dst.u.icmp.type])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) {
+ if (!tb[CTA_PROTO_ICMP_TYPE])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
+ if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type])
+ return -EINVAL;
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) {
+ if (!tb[CTA_PROTO_ICMP_CODE])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) {
+ if (!tb[CTA_PROTO_ICMP_ID])
+ return -EINVAL;
+
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
+ }
return 0;
}
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 6f9144e1f1c1..61e3b05cf02c 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -24,6 +24,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
+#include "nf_internals.h"
+
static const unsigned int nf_ct_icmpv6_timeout = 30*HZ;
bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
@@ -124,8 +126,7 @@ static void icmpv6_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_ICMPV6, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg);
}
int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
@@ -193,21 +194,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = {
};
static int icmpv6_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
- if (!tb[CTA_PROTO_ICMPV6_TYPE] ||
- !tb[CTA_PROTO_ICMPV6_CODE] ||
- !tb[CTA_PROTO_ICMPV6_ID])
- return -EINVAL;
-
- tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]);
- tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]);
- tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]);
-
- if (tuple->dst.u.icmp.type < 128 ||
- tuple->dst.u.icmp.type - 128 >= sizeof(invmap) ||
- !invmap[tuple->dst.u.icmp.type - 128])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) {
+ if (!tb[CTA_PROTO_ICMPV6_TYPE])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]);
+ if (tuple->dst.u.icmp.type < 128 ||
+ tuple->dst.u.icmp.type - 128 >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type - 128])
+ return -EINVAL;
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) {
+ if (!tb[CTA_PROTO_ICMPV6_CODE])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) {
+ if (!tb[CTA_PROTO_ICMPV6_ID])
+ return -EINVAL;
+
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]);
+ }
return 0;
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 4f897b14b606..5a936334b517 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -62,6 +62,8 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
[SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
};
+#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
+
#define sNO SCTP_CONNTRACK_NONE
#define sCL SCTP_CONNTRACK_CLOSED
#define sCW SCTP_CONNTRACK_COOKIE_WAIT
@@ -349,7 +351,7 @@ static bool sctp_error(struct sk_buff *skb,
}
return false;
out_invalid:
- nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg);
return true;
}
@@ -369,6 +371,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
u_int32_t offset, count;
unsigned int *timeouts;
unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+ bool ignore = false;
if (sctp_error(skb, dataoff, state))
return -NF_ACCEPT;
@@ -427,15 +430,39 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
/* Sec 8.5.1 (D) */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
- } else if (sch->type == SCTP_CID_HEARTBEAT ||
- sch->type == SCTP_CID_HEARTBEAT_ACK) {
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+ if (test_bit(SCTP_CID_DATA, map) || ignore)
+ goto out_unlock;
+
+ ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ ct->proto.sctp.last_dir = dir;
+ ignore = true;
+ continue;
+ } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ }
+ } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
if (ct->proto.sctp.vtag[dir] == 0) {
pr_debug("Setting vtag %x for dir %d\n",
sh->vtag, dir);
ct->proto.sctp.vtag[dir] = sh->vtag;
} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
- goto out_unlock;
+ if (test_bit(SCTP_CID_DATA, map) || ignore)
+ goto out_unlock;
+
+ if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
+ ct->proto.sctp.last_dir == dir)
+ goto out_unlock;
+
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ ct->proto.sctp.vtag[!dir] = 0;
+ } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
}
}
@@ -462,6 +489,15 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
pr_debug("Setting vtag %x for dir %d\n",
ih->init_tag, !dir);
ct->proto.sctp.vtag[!dir] = ih->init_tag;
+
+ /* don't renew timeout on init retransmit so
+ * port reuse by client or NAT middlebox cannot
+ * keep entry alive indefinitely (incl. nat info).
+ */
+ if (new_state == SCTP_CONNTRACK_CLOSED &&
+ old_state == SCTP_CONNTRACK_CLOSED &&
+ nf_ct_is_confirmed(ct))
+ ignore = true;
}
ct->proto.sctp.state = new_state;
@@ -470,6 +506,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
+ /* allow but do not refresh timeout */
+ if (ignore)
+ return NF_ACCEPT;
+
timeouts = nf_ct_timeout_lookup(ct);
if (!timeouts)
timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;
@@ -512,7 +552,7 @@ static bool sctp_can_early_drop(const struct nf_conn *ct)
#include <linux/netfilter/nfnetlink_conntrack.h>
static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
@@ -521,15 +561,20 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
- nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 1926fd56df56..656631083177 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -31,20 +31,6 @@
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-/* "Be conservative in what you do,
- be liberal in what you accept from others."
- If it's non-zero, we mark only out of window RST segments as INVALID. */
-static int nf_ct_tcp_be_liberal __read_mostly = 0;
-
-/* If it is set to zero, we disable picking up already established
- connections. */
-static int nf_ct_tcp_loose __read_mostly = 1;
-
-/* Max number of the retransmitted packets without receiving an (acceptable)
- ACK from the destination. If this number is reached, a shorter timer
- will be started. */
-static int nf_ct_tcp_max_retrans __read_mostly = 3;
-
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR */
@@ -61,6 +47,12 @@ static const char *const tcp_conntrack_names[] = {
"SYN_SENT2",
};
+enum nf_ct_tcp_action {
+ NFCT_TCP_IGNORE,
+ NFCT_TCP_INVALID,
+ NFCT_TCP_ACCEPT,
+};
+
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
@@ -352,10 +344,11 @@ static void tcp_options(const struct sk_buff *skb,
ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
length, buff);
- BUG_ON(ptr == NULL);
+ if (!ptr)
+ return;
- state->td_scale =
- state->flags = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
while (length > 0) {
int opcode=*ptr++;
@@ -408,7 +401,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
length, buff);
- BUG_ON(ptr == NULL);
+ if (!ptr)
+ return;
/* Fast path for timestamp-only option */
if (length == TCPOLEN_TSTAMP_ALIGNED
@@ -458,23 +452,71 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
}
}
-static bool tcp_in_window(const struct nf_conn *ct,
- struct ip_ct_tcp *state,
- enum ip_conntrack_dir dir,
- unsigned int index,
- const struct sk_buff *skb,
- unsigned int dataoff,
- const struct tcphdr *tcph)
+static void tcp_init_sender(struct ip_ct_tcp_state *sender,
+ struct ip_ct_tcp_state *receiver,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ u32 end, u32 win)
{
- struct net *net = nf_ct_net(ct);
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ /* SYN-ACK in reply to a SYN
+ * or SYN from reply direction in simultaneous open.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ /* RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
+ sender->td_scale = 0;
+ receiver->td_scale = 0;
+ }
+}
+
+__printf(6, 7)
+static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb,
+ const struct nf_conn *ct,
+ const struct nf_hook_state *state,
+ const struct ip_ct_tcp_state *sender,
+ enum nf_ct_tcp_action ret,
+ const char *fmt, ...)
+{
+ const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct));
+ struct va_format vaf;
+ va_list args;
+ bool be_liberal;
+
+ be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal;
+ if (be_liberal)
+ return NFCT_TCP_ACCEPT;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf);
+ va_end(args);
+
+ return ret;
+}
+
+static enum nf_ct_tcp_action
+tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
+ unsigned int index, const struct sk_buff *skb,
+ unsigned int dataoff, const struct tcphdr *tcph,
+ const struct nf_hook_state *hook_state)
+{
+ struct ip_ct_tcp *state = &ct->proto.tcp;
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
- const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
__u32 seq, ack, sack, end, win, swin;
- u16 win_raw;
+ bool in_recv_win, seq_ok;
s32 receiver_offset;
- bool res, in_recv_win;
+ u16 win_raw;
/*
* Get the required data from the packet.
@@ -493,44 +535,17 @@ static bool tcp_in_window(const struct nf_conn *ct,
ack -= receiver_offset;
sack -= receiver_offset;
- pr_debug("tcp_in_window: START\n");
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
if (sender->td_maxwin == 0) {
/*
* Initialize sender data.
*/
if (tcph->syn) {
- /*
- * SYN-ACK in reply to a SYN
- * or SYN from reply direction in simultaneous open.
- */
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
-
- tcp_options(skb, dataoff, tcph, sender);
- /*
- * RFC 1323:
- * Both sides must send the Window Scale option
- * to enable window scaling in either direction.
- */
- if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
- && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
- sender->td_scale =
- receiver->td_scale = 0;
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win);
if (!tcph->ack)
/* Simultaneous open */
- return true;
+ return NFCT_TCP_ACCEPT;
} else {
/*
* We are in the middle of a connection,
@@ -541,29 +556,39 @@ static bool tcp_in_window(const struct nf_conn *ct,
swin = win << sender->td_scale;
sender->td_maxwin = (swin == 0 ? 1 : swin);
sender->td_maxend = end + sender->td_maxwin;
- /*
- * We haven't seen traffic in the other direction yet
- * but we have to tweak window tracking to pass III
- * and IV until that happens.
- */
- if (receiver->td_maxwin == 0)
+ if (receiver->td_maxwin == 0) {
+ /* We haven't seen traffic in the other
+ * direction yet but we have to tweak window
+ * tracking to pass III and IV until that
+ * happens.
+ */
receiver->td_end = receiver->td_maxend = sack;
+ } else if (sack == receiver->td_end + 1) {
+ /* Likely a reply to a keepalive.
+ * Needed for III.
+ */
+ receiver->td_end++;
+ }
+
}
- } else if (((state->state == TCP_CONNTRACK_SYN_SENT
- && dir == IP_CT_DIR_ORIGINAL)
- || (state->state == TCP_CONNTRACK_SYN_RECV
- && dir == IP_CT_DIR_REPLY))
- && after(end, sender->td_end)) {
+ } else if (tcph->syn &&
+ after(end, sender->td_end) &&
+ (state->state == TCP_CONNTRACK_SYN_SENT ||
+ state->state == TCP_CONNTRACK_SYN_RECV)) {
/*
* RFC 793: "if a TCP is reinitialized ... then it need
* not wait at all; it must only be sure to use sequence
* numbers larger than those recently used."
+ *
+ * Re-init state for this direction, just like for the first
+ * syn(-ack) reply, it might differ in seq, ack or tcp options.
*/
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win);
- tcp_options(skb, dataoff, tcph, sender);
+ if (dir == IP_CT_DIR_REPLY && !tcph->ack)
+ return NFCT_TCP_ACCEPT;
}
if (!(tcph->ack)) {
@@ -587,113 +612,166 @@ static bool tcp_in_window(const struct nf_conn *ct,
*/
seq = end = sender->td_end;
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
+ seq_ok = before(seq, sender->td_maxend + 1);
+ if (!seq_ok) {
+ u32 overshot = end - sender->td_maxend + 1;
+ bool ack_ok;
+
+ ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1);
+ in_recv_win = receiver->td_maxwin &&
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
+ if (in_recv_win &&
+ ack_ok &&
+ overshot <= receiver->td_maxwin &&
+ before(sack, receiver->td_end + 1)) {
+ /* Work around TCPs that send more bytes than allowed by
+ * the receive window.
+ *
+ * If the (marked as invalid) packet is allowed to pass by
+ * the ruleset and the peer acks this data, then its possible
+ * all future packets will trigger 'ACK is over upper bound' check.
+ *
+ * Thus if only the sequence check fails then do update td_end so
+ * possible ACK for this data can update internal state.
+ */
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "%u bytes more than expected", overshot);
+ }
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "SEQ is over upper bound %u (over the window of the receiver)",
+ sender->td_maxend + 1);
+ }
+
+ if (!before(sack, receiver->td_end + 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "ACK is over upper bound %u (ACKed data not seen yet)",
+ receiver->td_end + 1);
/* Is the ending sequence in the receive window (if available)? */
in_recv_win = !receiver->td_maxwin ||
after(end, sender->td_end - receiver->td_maxwin - 1);
+ if (!in_recv_win)
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "SEQ is under lower bound %u (already ACKed data retransmitted)",
+ sender->td_end - receiver->td_maxwin - 1);
+ if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "ignored ACK under lower bound %u (possible overly delayed)",
+ receiver->td_end - MAXACKWINDOW(sender) - 1);
+
+ /* Take into account window scaling (RFC 1323). */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /* Update sender data. */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end)) {
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ }
+ if (tcph->ack) {
+ if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+ sender->td_maxack = ack;
+ sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+ } else if (after(ack, sender->td_maxack)) {
+ sender->td_maxack = ack;
+ }
+ }
- pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
- before(seq, sender->td_maxend + 1),
- (in_recv_win ? 1 : 0),
- before(sack, receiver->td_end + 1),
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+ /* Update receiver data. */
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+ if (ack == receiver->td_end)
+ receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ /* Check retransmissions. */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir &&
+ state->last_seq == seq &&
+ state->last_ack == ack &&
+ state->last_end == end &&
+ state->last_win == win_raw) {
+ state->retrans++;
+ } else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->last_win = win_raw;
+ state->retrans = 0;
+ }
+ }
- if (before(seq, sender->td_maxend + 1) &&
- in_recv_win &&
- before(sack, receiver->td_end + 1) &&
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
- /*
- * Take into account window scaling (RFC 1323).
- */
- if (!tcph->syn)
- win <<= sender->td_scale;
+ return NFCT_TCP_ACCEPT;
+}
- /*
- * Update sender data.
- */
- swin = win + (sack - ack);
- if (sender->td_maxwin < swin)
- sender->td_maxwin = swin;
- if (after(end, sender->td_end)) {
- sender->td_end = end;
- sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
- }
- if (tcph->ack) {
- if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
- sender->td_maxack = ack;
- sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
- } else if (after(ack, sender->td_maxack))
- sender->td_maxack = ack;
- }
+static void __cold nf_tcp_handle_invalid(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ int index,
+ const struct sk_buff *skb,
+ const struct nf_hook_state *hook_state)
+{
+ const unsigned int *timeouts;
+ const struct nf_tcp_net *tn;
+ unsigned int timeout;
+ u32 expires;
- /*
- * Update receiver data.
- */
- if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
- receiver->td_maxwin += end - sender->td_maxend;
- if (after(sack + win, receiver->td_maxend - 1)) {
- receiver->td_maxend = sack + win;
- if (win == 0)
- receiver->td_maxend++;
- }
- if (ack == receiver->td_end)
- receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return;
- /*
- * Check retransmissions.
- */
- if (index == TCP_ACK_SET) {
- if (state->last_dir == dir
- && state->last_seq == seq
- && state->last_ack == ack
- && state->last_end == end
- && state->last_win == win_raw)
- state->retrans++;
- else {
- state->last_dir = dir;
- state->last_seq = seq;
- state->last_ack = ack;
- state->last_end = end;
- state->last_win = win_raw;
- state->retrans = 0;
- }
- }
- res = true;
- } else {
- res = false;
- if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
- tn->tcp_be_liberal)
- res = true;
- if (!res) {
- nf_ct_l4proto_log_invalid(skb, ct,
- "%s",
- before(seq, sender->td_maxend + 1) ?
- in_recv_win ?
- before(sack, receiver->td_end + 1) ?
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
- : "ACK is under the lower bound (possible overly delayed ACK)"
- : "ACK is over the upper bound (ACKed data not seen yet)"
- : "SEQ is under the lower bound (already ACKed data retransmitted)"
- : "SEQ is over the upper bound (over the window of the receiver)");
- }
+ /* We don't want to have connections hanging around in ESTABLISHED
+ * state for long time 'just because' conntrack deemed a FIN/RST
+ * out-of-window.
+ *
+ * Shrink the timeout just like when there is unacked data.
+ * This speeds up eviction of 'dead' connections where the
+ * connection and conntracks internal state are out of sync.
+ */
+ switch (index) {
+ case TCP_RST_SET:
+ case TCP_FIN_SET:
+ break;
+ default:
+ return;
}
- pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
- "receiver end=%u maxend=%u maxwin=%u\n",
- res, sender->td_end, sender->td_maxend, sender->td_maxwin,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+ if (ct->proto.tcp.last_dir != dir &&
+ (ct->proto.tcp.last_index == TCP_FIN_SET ||
+ ct->proto.tcp.last_index == TCP_RST_SET)) {
+ expires = nf_ct_expires(ct);
+ if (expires < 120 * HZ)
+ return;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = tn->timeouts;
+
+ timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]);
+ if (expires > timeout) {
+ nf_ct_l4proto_log_invalid(skb, ct, hook_state,
+ "packet (index %d, dir %d) response for index %d lower timeout to %u",
+ index, dir, ct->proto.tcp.last_index, timeout);
- return res;
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+ }
+ } else {
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+ }
}
/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
@@ -715,7 +793,7 @@ static void tcp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
}
/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
@@ -762,8 +840,6 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
enum tcp_conntrack new_state;
struct net *net = nf_ct_net(ct);
const struct nf_tcp_net *tn = nf_tcp_pernet(net);
- const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
- const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
/* Don't need lock here: this conntrack not in circulation yet */
new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
@@ -816,21 +892,33 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* tcp_packet will set them */
ct->proto.tcp.last_index = TCP_NONE_SET;
-
- pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- __func__,
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
return true;
}
-static bool nf_conntrack_tcp_established(const struct nf_conn *ct)
+static bool tcp_can_early_drop(const struct nf_conn *ct)
+{
+ switch (ct->proto.tcp.state) {
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ case TCP_CONNTRACK_CLOSE:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
{
- return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED &&
- test_bit(IPS_ASSURED_BIT, &ct->status);
+ state->td_end = 0;
+ state->td_maxend = 0;
+ state->td_maxwin = 0;
+ state->td_maxack = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
}
/* Returns verdict for packet, or -1 for invalid. */
@@ -845,6 +933,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
unsigned int index, *timeouts;
+ enum nf_ct_tcp_action res;
enum ip_conntrack_dir dir;
const struct tcphdr *th;
struct tcphdr _tcph;
@@ -900,7 +989,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
return -NF_REPEAT;
return NF_DROP;
}
- /* Fall through */
+ fallthrough;
case TCP_CONNTRACK_IGNORE:
/* Ignored packets:
*
@@ -939,8 +1028,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
ct->proto.tcp.last_flags;
- memset(&ct->proto.tcp.seen[dir], 0,
- sizeof(struct ip_ct_tcp_state));
+ nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
break;
}
ct->proto.tcp.last_index = index;
@@ -981,8 +1069,10 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
IP_CT_EXP_CHALLENGE_ACK;
}
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in "
- "state %s ", tcp_conntrack_names[old_state]);
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d ignored, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
/* Special case for SYN proxy: when the SYN to the server or
@@ -1001,10 +1091,11 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
}
/* Invalid packet */
- pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
- dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid state");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d invalid, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return -NF_ACCEPT;
case TCP_CONNTRACK_TIME_WAIT:
/* RFC5961 compliance cause stack to send "challenge-ACK"
@@ -1019,7 +1110,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
/* Detected RFC5961 challenge ACK */
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
return NF_ACCEPT; /* Don't change state */
}
break;
@@ -1038,13 +1129,33 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
if (index != TCP_RST_SET)
break;
- if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
+ /* If we are closing, tuple might have been re-used already.
+ * last_index, last_ack, and all other ct fields used for
+ * sequence/window validation are outdated in that case.
+ *
+ * As the conntrack can already be expired by GC under pressure,
+ * just skip validation checks.
+ */
+ if (tcp_can_early_drop(ct))
+ goto in_window;
+
+ /* td_maxack might be outdated if we let a SYN through earlier */
+ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
+ ct->proto.tcp.last_index != TCP_SYN_SET) {
u32 seq = ntohl(th->seq);
- if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
+ /* If we are not in established state and SEQ=0 this is most
+ * likely an answer to a SYN we let go through above (last_index
+ * can be updated due to out-of-order ACKs).
+ */
+ if (seq == 0 && !nf_conntrack_tcp_established(ct))
+ break;
+
+ if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+ !tn->tcp_ignore_invalid_rst) {
/* Invalid RST */
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid rst");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
return -NF_ACCEPT;
}
@@ -1088,10 +1199,18 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
break;
}
- if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
- skb, dataoff, th)) {
+ res = tcp_in_window(ct, dir, index,
+ skb, dataoff, th, state);
+ switch (res) {
+ case NFCT_TCP_IGNORE:
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ case NFCT_TCP_INVALID:
+ nf_tcp_handle_invalid(ct, dir, index, skb, state);
spin_unlock_bh(&ct->lock);
return -NF_ACCEPT;
+ case NFCT_TCP_ACCEPT:
+ break;
}
in_window:
/* From now on we have got in-window packets */
@@ -1142,6 +1261,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
+
+ if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
+ /* do not renew timeout on SYN retransmit.
+ *
+ * Else port reuse by client or NAT middlebox can keep
+ * entry alive indefinitely (including nat info).
+ */
+ return NF_ACCEPT;
+ }
+
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
@@ -1152,7 +1281,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
&& new_state == TCP_CONNTRACK_ESTABLISHED) {
- /* Set ASSURED if we see see valid ack in ESTABLISHED
+ /* Set ASSURED if we see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
set_bit(IPS_ASSURED_BIT, &ct->status);
@@ -1163,29 +1292,13 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-static bool tcp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.tcp.state) {
- case TCP_CONNTRACK_FIN_WAIT:
- case TCP_CONNTRACK_LAST_ACK:
- case TCP_CONNTRACK_TIME_WAIT:
- case TCP_CONNTRACK_CLOSE:
- case TCP_CONNTRACK_CLOSE_WAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
struct nf_ct_tcp_flags tmp = {};
@@ -1195,8 +1308,13 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
ct->proto.tcp.seen[0].td_scale) ||
nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
ct->proto.tcp.seen[1].td_scale))
@@ -1211,8 +1329,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
sizeof(struct nf_ct_tcp_flags), &tmp))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
@@ -1428,9 +1546,30 @@ void nf_conntrack_tcp_init_net(struct net *net)
* ->timeouts[0] contains 'new' timeout, like udp or icmp.
*/
tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
- tn->tcp_loose = nf_ct_tcp_loose;
- tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
- tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
+
+ /* If it is set to zero, we disable picking up already established
+ * connections.
+ */
+ tn->tcp_loose = 1;
+
+ /* "Be conservative in what you do,
+ * be liberal in what you accept from others."
+ * If it's non-zero, we mark only out of window RST segments as INVALID.
+ */
+ tn->tcp_be_liberal = 0;
+
+ /* If it's non-zero, we turn off RST sequence number check */
+ tn->tcp_ignore_invalid_rst = 0;
+
+ /* Max number of the retransmitted packets without receiving an (acceptable)
+ * ACK from the destination. If this number is reached, a shorter timer
+ * will be started.
+ */
+ tn->tcp_max_retrans = 3;
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ tn->offload_timeout = 30 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 760ca2422816..3b516cffc779 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -38,8 +38,7 @@ static void udp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_UDP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg);
}
static bool udp_error(struct sk_buff *skb,
@@ -81,18 +80,6 @@ static bool udp_error(struct sk_buff *skb,
return false;
}
-static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct,
- struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- u32 extra_jiffies)
-{
- if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY &&
- ct->status & IPS_NAT_CLASH))
- nf_ct_kill(ct);
- else
- nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);
-}
-
/* Returns verdict for packet, and may modify conntracktype */
int nf_conntrack_udp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -117,19 +104,25 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
*/
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
unsigned long extra = timeouts[UDP_CT_UNREPLIED];
+ bool stream = false;
/* Still active after two seconds? Extend timeout. */
- if (time_after(jiffies, ct->proto.udp.stream_ts))
+ if (time_after(jiffies, ct->proto.udp.stream_ts)) {
extra = timeouts[UDP_CT_REPLIED];
+ stream = true;
+ }
nf_ct_refresh_acct(ct, ctinfo, skb, extra);
+ /* never set ASSURED for IPS_NAT_CLASH, they time out soon */
+ if (unlikely((ct->status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe */
- if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -139,8 +132,7 @@ static void udplite_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_UDPLITE, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg);
}
static bool udplite_error(struct sk_buff *skb,
@@ -206,12 +198,15 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct,
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
nf_ct_refresh_acct(ct, ctinfo, skb,
timeouts[UDP_CT_REPLIED]);
+
+ if (unlikely((ct->status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -276,6 +271,10 @@ void nf_conntrack_udp_init_net(struct net *net)
for (i = 0; i < UDP_CT_MAX; i++)
un->timeouts[i] = udp_timeouts[i];
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ un->offload_timeout = 30 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 1aebd6569d4e..13dc421fc4f5 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -34,10 +34,6 @@ MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
MODULE_DESCRIPTION("SANE connection tracking helper");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
-static char *sane_buffer;
-
-static DEFINE_SPINLOCK(nf_sane_lock);
-
#define MAX_PORTS 8
static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c;
@@ -67,14 +63,16 @@ static int help(struct sk_buff *skb,
unsigned int dataoff, datalen;
const struct tcphdr *th;
struct tcphdr _tcph;
- void *sb_ptr;
int ret = NF_ACCEPT;
int dir = CTINFO2DIR(ctinfo);
struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple *tuple;
- struct sane_request *req;
struct sane_reply_net_start *reply;
+ union {
+ struct sane_request req;
+ struct sane_reply_net_start repl;
+ } buf;
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
@@ -92,56 +90,62 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
datalen = skb->len - dataoff;
-
- spin_lock_bh(&nf_sane_lock);
- sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
- BUG_ON(sb_ptr == NULL);
-
if (dir == IP_CT_DIR_ORIGINAL) {
+ const struct sane_request *req;
+
if (datalen != sizeof(struct sane_request))
- goto out;
+ return NF_ACCEPT;
+
+ req = skb_header_pointer(skb, dataoff, datalen, &buf.req);
+ if (!req)
+ return NF_ACCEPT;
- req = sb_ptr;
if (req->RPC_code != htonl(SANE_NET_START)) {
/* Not an interesting command */
- ct_sane_info->state = SANE_STATE_NORMAL;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
+ return NF_ACCEPT;
}
/* We're interested in the next reply */
- ct_sane_info->state = SANE_STATE_START_REQUESTED;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_START_REQUESTED);
+ return NF_ACCEPT;
}
+ /* IP_CT_DIR_REPLY */
+
/* Is it a reply to an uninteresting command? */
- if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
- goto out;
+ if (READ_ONCE(ct_sane_info->state) != SANE_STATE_START_REQUESTED)
+ return NF_ACCEPT;
/* It's a reply to SANE_NET_START. */
- ct_sane_info->state = SANE_STATE_NORMAL;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
if (datalen < sizeof(struct sane_reply_net_start)) {
pr_debug("NET_START reply too short\n");
- goto out;
+ return NF_ACCEPT;
}
- reply = sb_ptr;
+ datalen = sizeof(struct sane_reply_net_start);
+
+ reply = skb_header_pointer(skb, dataoff, datalen, &buf.repl);
+ if (!reply)
+ return NF_ACCEPT;
+
if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
/* saned refused the command */
pr_debug("unsuccessful SANE_STATUS = %u\n",
ntohl(reply->status));
- goto out;
+ return NF_ACCEPT;
}
/* Invalid saned reply? Ignore it. */
if (reply->zero != 0)
- goto out;
+ return NF_ACCEPT;
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
nf_ct_helper_log(skb, ct, "cannot alloc expectation");
- ret = NF_DROP;
- goto out;
+ return NF_DROP;
}
tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
@@ -159,9 +163,6 @@ static int help(struct sk_buff *skb,
}
nf_ct_expect_put(exp);
-
-out:
- spin_unlock_bh(&nf_sane_lock);
return ret;
}
@@ -175,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
static void __exit nf_conntrack_sane_fini(void)
{
nf_conntrack_helpers_unregister(sane, ports_c * 2);
- kfree(sane_buffer);
}
static int __init nf_conntrack_sane_init(void)
@@ -184,10 +184,6 @@ static int __init nf_conntrack_sane_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master));
- sane_buffer = kmalloc(65536, GFP_KERNEL);
- if (!sane_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = SANE_PORT;
@@ -207,7 +203,6 @@ static int __init nf_conntrack_sane_init(void)
ret = nf_conntrack_helpers_register(sane, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(sane_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 3066449f8bd8..7ab2b25b57bc 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -232,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct,
this_way->offset_after : this_way->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
-
-static const struct nf_ct_ext_type nf_ct_seqadj_extend = {
- .len = sizeof(struct nf_conn_seqadj),
- .align = __alignof__(struct nf_conn_seqadj),
- .id = NF_CT_EXT_SEQADJ,
-};
-
-int nf_conntrack_seqadj_init(void)
-{
- return nf_ct_extend_register(&nf_ct_seqadj_extend);
-}
-
-void nf_conntrack_seqadj_fini(void)
-{
- nf_ct_extend_unregister(&nf_ct_seqadj_extend);
-}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index b83dc9bf0a5d..77f5e82d8e3f 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -60,7 +60,7 @@ module_param(sip_external_media, int, 0600);
MODULE_PARM_DESC(sip_external_media, "Expect Media streams between external "
"endpoints (default 0)");
-const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks;
EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
static int string_len(const struct nf_conn *ct, const char *dptr,
@@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
return ret;
if (ret == 0)
break;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
*in_header = 0;
}
@@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
break;
if (ret == 0)
return ret;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
if (in_header)
@@ -1229,6 +1229,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
struct nf_conntrack_expect *exp;
union nf_inet_addr *saddr, daddr;
const struct nf_nat_sip_hooks *hooks;
+ struct nf_conntrack_helper *helper;
__be16 port;
u8 proto;
unsigned int expires = 0;
@@ -1289,10 +1290,14 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
if (sip_direct_signalling)
saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ helper = rcu_dereference(nfct_help(ct)->helper);
+ if (!helper)
+ return NF_DROP;
+
nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
saddr, &daddr, proto, NULL, &port);
exp->timeout.expires = sip_timeout * HZ;
- exp->helper = nfct_help(ct)->helper;
+ exp->helper = helper;
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
hooks = rcu_dereference(nf_nat_sip_hooks);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 4912069627b6..4ffe84c5a82c 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -22,6 +22,9 @@
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
+#ifdef CONFIG_LWTUNNEL
+#include <net/netfilter/nf_hooks_lwtunnel.h>
+#endif
#include <linux/rculist_nulls.h>
static bool enable_hooks __read_mostly;
@@ -60,7 +63,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
ntohs(tuple->src.u.tcp.port),
ntohs(tuple->dst.u.tcp.port));
break;
- case IPPROTO_UDPLITE: /* fallthrough */
+ case IPPROTO_UDPLITE:
case IPPROTO_UDP:
seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.udp.port),
@@ -266,6 +269,7 @@ static const char* l4proto_name(u16 proto)
case IPPROTO_GRE: return "gre";
case IPPROTO_SCTP: return "sctp";
case IPPROTO_UDPLITE: return "udplite";
+ case IPPROTO_ICMPV6: return "icmpv6";
}
return "unknown";
@@ -299,9 +303,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
WARN_ON(!ct);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use)))
return 0;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct)) {
nf_ct_kill(ct);
goto release;
@@ -348,7 +355,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
goto release;
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status))
+ seq_puts(s, "[HW_OFFLOAD] ");
+ else if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
seq_puts(s, "[OFFLOAD] ");
else if (test_bit(IPS_ASSURED_BIT, &ct->status))
seq_puts(s, "[ASSURED] ");
@@ -364,7 +373,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
ct_show_delta_time(s, ct);
- seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+ seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use));
if (seq_has_overflowed(s))
goto release;
@@ -422,24 +431,26 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
static int ct_cpu_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_net(seq);
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
const struct ip_conntrack_stat *st = v;
+ unsigned int nr_conntracks;
if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
+ nr_conntracks = nf_conntrack_count(net);
+
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- 0,
+ st->clash_resolve,
st->found,
0,
st->invalid,
- st->ignore,
0,
0,
+ st->chaintoolong,
st->insert,
st->insert_failed,
st->drop,
@@ -505,22 +516,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
}
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+u32 nf_conntrack_count(const struct net *net)
+{
+ const struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return atomic_read(&cnet->count);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_count);
+
/* Sysctl support */
#ifdef CONFIG_SYSCTL
-/* Log invalid packets of a given protocol */
-static int log_invalid_proto_min __read_mostly;
-static int log_invalid_proto_max __read_mostly = 255;
-
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
static int
nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
+ /* module_param hashsize could have changed value */
+ nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret < 0 || !write)
return ret;
@@ -543,7 +561,6 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_LOG_INVALID,
NF_SYSCTL_CT_EXPECT_MAX,
NF_SYSCTL_CT_ACCT,
- NF_SYSCTL_CT_HELPER,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
NF_SYSCTL_CT_EVENTS,
#endif
@@ -561,11 +578,18 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE,
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD,
+#endif
NF_SYSCTL_CT_PROTO_TCP_LOOSE,
NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+ NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD,
+#endif
NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
#ifdef CONFIG_NF_CT_PROTO_SCTP
@@ -593,6 +617,9 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE,
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
#endif
+#ifdef CONFIG_LWTUNNEL
+ NF_SYSCTL_CT_LWTUNNEL,
+#endif
__NF_SYSCTL_CT_LAST_SYSCTL,
};
@@ -609,7 +636,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
[NF_SYSCTL_CT_COUNT] = {
.procname = "nf_conntrack_count",
- .data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -624,20 +650,18 @@ static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_CHECKSUM] = {
.procname = "nf_conntrack_checksum",
.data = &init_net.ct.sysctl_checksum,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_LOG_INVALID] = {
.procname = "nf_conntrack_log_invalid",
.data = &init_net.ct.sysctl_log_invalid,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &log_invalid_proto_min,
- .extra2 = &log_invalid_proto_max,
+ .proc_handler = proc_dou8vec_minmax,
},
[NF_SYSCTL_CT_EXPECT_MAX] = {
.procname = "nf_conntrack_expect_max",
@@ -649,18 +673,9 @@ static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_ACCT] = {
.procname = "nf_conntrack_acct",
.data = &init_net.ct.sysctl_acct,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- [NF_SYSCTL_CT_HELPER] = {
- .procname = "nf_conntrack_helper",
- .data = &init_net.ct.sysctl_auto_assign_helper,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
@@ -668,20 +683,20 @@ static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_EVENTS] = {
.procname = "nf_conntrack_events",
.data = &init_net.ct.sysctl_events,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_TWO,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
[NF_SYSCTL_CT_TIMESTAMP] = {
.procname = "nf_conntrack_timestamp",
.data = &init_net.ct.sysctl_tstamp,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
@@ -752,27 +767,43 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = {
+ .procname = "nf_flowtable_tcp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
[NF_SYSCTL_CT_PROTO_TCP_LOOSE] = {
.procname = "nf_conntrack_tcp_loose",
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = {
.procname = "nf_conntrack_tcp_be_liberal",
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+ .procname = "nf_conntrack_tcp_ignore_invalid_rst",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
.procname = "nf_conntrack_tcp_max_retrans",
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = {
.procname = "nf_conntrack_udp_timeout",
@@ -786,6 +817,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = {
+ .procname = "nf_flowtable_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = {
.procname = "nf_conntrack_icmp_timeout",
.maxlen = sizeof(unsigned int),
@@ -899,9 +938,9 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
[NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = {
.procname = "nf_conntrack_dccp_loose",
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
@@ -920,6 +959,15 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
+#ifdef CONFIG_LWTUNNEL
+ [NF_SYSCTL_CT_LWTUNNEL] = {
+ .procname = "nf_hooks_lwtunnel",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = nf_hooks_lwtunnel_sysctl_handler,
+ },
+#endif
{}
};
@@ -960,7 +1008,13 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
XASSIGN(LOOSE, &tn->tcp_loose);
XASSIGN(LIBERAL, &tn->tcp_be_liberal);
XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+ XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
#undef XASSIGN
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout;
+#endif
+
}
static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
@@ -1022,6 +1076,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net,
static int nf_conntrack_standalone_init_sysctl(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_udp_net *un = nf_udp_pernet(net);
struct ctl_table *table;
@@ -1032,11 +1087,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
if (!table)
return -ENOMEM;
- table[NF_SYSCTL_CT_COUNT].data = &net->ct.count;
+ table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
- table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
#endif
@@ -1048,30 +1102,24 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout;
table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED];
table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED];
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout;
+#endif
nf_conntrack_standalone_init_tcp_sysctl(net, table);
nf_conntrack_standalone_init_sctp_sysctl(net, table);
nf_conntrack_standalone_init_dccp_sysctl(net, table);
nf_conntrack_standalone_init_gre_sysctl(net, table);
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns) {
- table[NF_SYSCTL_CT_MAX].procname = NULL;
- table[NF_SYSCTL_CT_ACCT].procname = NULL;
- table[NF_SYSCTL_CT_HELPER].procname = NULL;
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL;
-#endif
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- table[NF_SYSCTL_CT_EVENTS].procname = NULL;
-#endif
- }
-
- if (!net_eq(&init_net, net))
+ /* Don't allow non-init_net ns to alter global sysctls */
+ if (!net_eq(&init_net, net)) {
+ table[NF_SYSCTL_CT_MAX].mode = 0444;
+ table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+ }
- net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
- if (!net->ct.sysctl_header)
+ cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table);
+ if (!cnet->sysctl_header)
goto out_unregister_netfilter;
return 0;
@@ -1083,10 +1131,11 @@ out_unregister_netfilter:
static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct ctl_table *table;
- table = net->ct.sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.sysctl_header);
+ table = cnet->sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(cnet->sysctl_header);
kfree(table);
}
#else
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 14387e0b8008..0cc584d3dbb1 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -22,19 +22,21 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-struct nf_ct_timeout *
-(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-
-void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
static int untimeout(struct nf_conn *ct, void *timeout)
{
struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
- RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ if (timeout_ext) {
+ const struct nf_ct_timeout *t;
+
+ t = rcu_access_pointer(timeout_ext->timeout);
+
+ if (!timeout || t == timeout)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
/* We are not intended to delete this conntrack. */
return 0;
@@ -42,37 +44,41 @@ static int untimeout(struct nf_conn *ct, void *timeout)
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
{
- nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = timeout,
+ };
+
+ nf_ct_iterate_cleanup_net(untimeout, &iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_untimeout);
static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout)
{
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook);
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
- if (timeout_put)
- timeout_put(timeout);
+ if (h)
+ h->timeout_put(timeout);
}
int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
u8 l3num, u8 l4num, const char *timeout_name)
{
- typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ const struct nf_ct_timeout_hooks *h;
struct nf_ct_timeout *timeout;
struct nf_conn_timeout *timeout_ext;
const char *errmsg = NULL;
int ret = 0;
rcu_read_lock();
- timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
- if (!timeout_find_get) {
+ h = rcu_dereference(nf_ct_timeout_hook);
+ if (!h) {
ret = -ENOENT;
errmsg = "Timeout policy base is empty";
goto out;
}
- timeout = timeout_find_get(net, timeout_name);
+ timeout = h->timeout_find_get(net, timeout_name);
if (!timeout) {
ret = -ENOENT;
pr_info_ratelimited("No such timeout policy \"%s\"\n",
@@ -119,37 +125,22 @@ EXPORT_SYMBOL_GPL(nf_ct_set_timeout);
void nf_ct_destroy_timeout(struct nf_conn *ct)
{
struct nf_conn_timeout *timeout_ext;
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h;
rcu_read_lock();
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ h = rcu_dereference(nf_ct_timeout_hook);
- if (timeout_put) {
+ if (h) {
timeout_ext = nf_ct_timeout_find(ct);
if (timeout_ext) {
- timeout_put(timeout_ext->timeout);
+ struct nf_ct_timeout *t;
+
+ t = rcu_dereference(timeout_ext->timeout);
+ if (t)
+ h->timeout_put(t);
RCU_INIT_POINTER(timeout_ext->timeout, NULL);
}
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
-
-static const struct nf_ct_ext_type timeout_extend = {
- .len = sizeof(struct nf_conn_timeout),
- .align = __alignof__(struct nf_conn_timeout),
- .id = NF_CT_EXT_TIMEOUT,
-};
-
-int nf_conntrack_timeout_init(void)
-{
- int ret = nf_ct_extend_register(&timeout_extend);
- if (ret < 0)
- pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
- return ret;
-}
-
-void nf_conntrack_timeout_fini(void)
-{
- nf_ct_extend_unregister(&timeout_extend);
-}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index f656d393fa92..9e43a0a59e73 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -19,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly;
module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
-static const struct nf_ct_ext_type tstamp_extend = {
- .len = sizeof(struct nf_conn_tstamp),
- .align = __alignof__(struct nf_conn_tstamp),
- .id = NF_CT_EXT_TSTAMP,
-};
-
void nf_conntrack_tstamp_pernet_init(struct net *net)
{
net->ct.sysctl_tstamp = nf_ct_tstamp;
}
-
-int nf_conntrack_tstamp_init(void)
-{
- int ret;
- ret = nf_ct_extend_register(&tstamp_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_tstamp_fini(void)
-{
- nf_ct_extend_unregister(&tstamp_extend);
-}
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index f108a76925dd..a8e2425e43b0 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,13 +13,31 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+#define NF_RECURSION_LIMIT 2
+
+static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
+ enum nf_dev_hooks hook)
{
- if (skb_mac_header_was_set(skb))
+ if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+ goto err;
+
+ if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
+ if (skb_cow_head(skb, skb->mac_len))
+ goto err;
+
skb_push(skb, skb->mac_len);
+ }
skb->dev = dev;
+ skb_clear_tstamp(skb);
+ __this_cpu_inc(nf_dup_skb_recursion);
dev_queue_xmit(skb);
+ __this_cpu_dec(nf_dup_skb_recursion);
+ return;
+err:
+ kfree_skb(skb);
}
void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
@@ -32,7 +50,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
return;
}
- nf_do_netdev_egress(pkt->skb, dev);
+ nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
@@ -47,7 +65,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
skb = skb_clone(pkt->skb, GFP_ATOMIC);
if (skb)
- nf_do_netdev_egress(skb, dev);
+ nf_do_netdev_egress(skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
@@ -73,3 +91,4 @@ EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter packet duplication support");
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 8af28e10b4e6..81c26a96c30b 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -39,22 +39,28 @@ flow_offload_fill_dir(struct flow_offload *flow,
ft->l3proto = ctt->src.l3num;
ft->l4proto = ctt->dst.protonum;
- ft->src_port = ctt->src.u.tcp.port;
- ft->dst_port = ctt->dst.u.tcp.port;
+
+ switch (ctt->dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+ break;
+ }
}
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
struct flow_offload *flow;
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(nf_ct_is_dying(ct)))
return NULL;
flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
if (!flow)
- goto err_ct_refcnt;
+ return NULL;
+ refcount_inc(&ct->ct_general.use);
flow->ct = ct;
flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -66,40 +72,82 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
__set_bit(NF_FLOW_DNAT, &flow->flags);
return flow;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
+
+static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
+{
+ const struct rt6_info *rt;
-err_ct_refcnt:
- nf_ct_put(ct);
+ if (flow_tuple->l3proto == NFPROTO_IPV6) {
+ rt = (const struct rt6_info *)flow_tuple->dst_cache;
+ return rt6_get_cookie(rt);
+ }
- return NULL;
+ return 0;
}
-EXPORT_SYMBOL_GPL(flow_offload_alloc);
static int flow_offload_fill_route(struct flow_offload *flow,
const struct nf_flow_route *route,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
struct dst_entry *dst = route->tuple[dir].dst;
-
- if (!dst_hold_safe(route->tuple[dir].dst))
- return -1;
+ int i, j = 0;
switch (flow_tuple->l3proto) {
case NFPROTO_IPV4:
flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
break;
case NFPROTO_IPV6:
- flow_tuple->mtu = ip6_dst_mtu_forward(dst);
+ flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
break;
}
- flow_tuple->iifidx = other_dst->dev->ifindex;
- flow_tuple->dst_cache = dst;
+ flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+ for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
+ flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
+ flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ if (route->tuple[dir].in.ingress_vlans & BIT(i))
+ flow_tuple->in_vlan_ingress |= BIT(j);
+ j++;
+ }
+ flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+
+ switch (route->tuple[dir].xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+ ETH_ALEN);
+ memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+ ETH_ALEN);
+ flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
+ break;
+ case FLOW_OFFLOAD_XMIT_XFRM:
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ if (!dst_hold_safe(route->tuple[dir].dst))
+ return -1;
+
+ flow_tuple->dst_cache = dst;
+ flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
return 0;
}
+static void nft_flow_dst_release(struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir)
+{
+ if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+ dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
+
int flow_offload_route_init(struct flow_offload *flow,
const struct nf_flow_route *route)
{
@@ -118,7 +166,7 @@ int flow_offload_route_init(struct flow_offload *flow,
return 0;
err_route_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
return err;
}
@@ -126,51 +174,43 @@ EXPORT_SYMBOL_GPL(flow_offload_route_init);
static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
- tcp->state = TCP_CONNTRACK_ESTABLISHED;
tcp->seen[0].td_maxwin = 0;
tcp->seen[1].td_maxwin = 0;
}
-#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
-#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
-
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
{
- const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
- unsigned int timeout;
+ s32 timeout;
- l4proto = nf_ct_l4proto_find(l4num);
- if (!l4proto)
- return;
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
- if (l4num == IPPROTO_TCP)
- timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
- else if (l4num == IPPROTO_UDP)
- timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
- else
- return;
+ flow_offload_fixup_tcp(&ct->proto.tcp);
- if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
- ct->timeout = nfct_time_stamp + timeout;
-}
+ timeout = tn->timeouts[ct->proto.tcp.state];
+ timeout -= tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
- if (nf_ct_protonum(ct) == IPPROTO_TCP)
- flow_offload_fixup_tcp(&ct->proto.tcp);
-}
+ timeout = tn->timeouts[UDP_CT_REPLIED];
+ timeout -= tn->offload_timeout;
+ } else {
+ return;
+ }
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
- flow_offload_fixup_ct_state(ct);
- flow_offload_fixup_ct_timeout(ct);
+ if (timeout < 0)
+ timeout = 0;
+
+ if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
}
static void flow_offload_route_release(struct flow_offload *flow)
{
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}
void flow_offload_free(struct flow_offload *flow)
@@ -191,14 +231,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple *tuple = data;
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple_rhash *tuplehash = data;
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -207,7 +247,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
const struct flow_offload_tuple *tuple = arg->key;
const struct flow_offload_tuple_rhash *x = ptr;
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
return 1;
return 0;
@@ -221,11 +261,30 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = {
.automatic_shrinking = true,
};
+unsigned long flow_offload_get_timeout(struct flow_offload *flow)
+{
+ unsigned long timeout = NF_FLOW_TIMEOUT;
+ struct net *net = nf_ct_net(flow->ct);
+ int l4num = nf_ct_protonum(flow->ct);
+
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ }
+
+ return timeout;
+}
+
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
int err;
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+ flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
err = rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[0].node,
@@ -243,6 +302,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
return err;
}
+ nf_ct_offload_timeout(flow->ct);
+
if (nf_flowtable_hw_offload(flow_table)) {
__set_bit(NF_FLOW_HW, &flow->flags);
nf_flow_offload_add(flow_table, flow);
@@ -252,6 +313,24 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
}
EXPORT_SYMBOL_GPL(flow_offload_add);
+void flow_offload_refresh(struct nf_flowtable *flow_table,
+ struct flow_offload *flow)
+{
+ u32 timeout;
+
+ timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
+ if (timeout - READ_ONCE(flow->timeout) > HZ)
+ WRITE_ONCE(flow->timeout, timeout);
+ else
+ return;
+
+ if (likely(!nf_flowtable_hw_offload(flow_table)))
+ return;
+
+ nf_flow_offload_add(flow_table, flow);
+}
+EXPORT_SYMBOL_GPL(flow_offload_refresh);
+
static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
return nf_flow_timeout_delta(flow->timeout) <= 0;
@@ -266,22 +345,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
-
- clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
-
- if (nf_flow_has_expired(flow))
- flow_offload_fixup_ct(flow->ct);
- else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
- flow_offload_fixup_ct_timeout(flow->ct);
-
flow_offload_free(flow);
}
void flow_offload_teardown(struct flow_offload *flow)
{
+ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
set_bit(NF_FLOW_TEARDOWN, &flow->flags);
-
- flow_offload_fixup_ct_state(flow->ct);
+ flow_offload_fixup_ct(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -312,7 +383,8 @@ EXPORT_SYMBOL_GPL(flow_offload_lookup);
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
- void (*iter)(struct flow_offload *flow, void *data),
+ void (*iter)(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data),
void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -336,7 +408,7 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
- iter(flow, data);
+ iter(flow_table, flow, data);
}
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
@@ -344,12 +416,14 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
return err;
}
-static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
+static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
{
- struct nf_flowtable *flow_table = data;
+ if (nf_flow_has_expired(flow) ||
+ nf_ct_is_dying(flow->ct))
+ flow_offload_teardown(flow);
- if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
- test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
nf_flow_offload_del(flow_table, flow);
@@ -363,78 +437,63 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
}
}
+void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
+{
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
+}
+
static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct nf_flowtable *flow_table;
flow_table = container_of(work, struct nf_flowtable, gc_work.work);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
+ nf_flow_table_gc_run(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
-
- return 0;
+ inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
- new_port, true);
+ new_port, false);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
+static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_tcp(skb, thoff, port, new_port);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_udp(skb, thoff, port, new_port);
break;
}
-
- return 0;
}
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -448,25 +507,19 @@ int nf_flow_snat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, u8 protocol,
+ enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -480,11 +533,9 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
@@ -492,8 +543,9 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
{
int err;
- INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+ INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
flow_block_init(&flowtable->flow_block);
+ init_rwsem(&flowtable->flow_block_lock);
err = rhashtable_init(&flowtable->rhashtable,
&nf_flow_offload_rhash_params);
@@ -511,7 +563,8 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);
-static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
@@ -526,8 +579,8 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
flow_offload_teardown(flow);
}
-static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
- struct net_device *dev)
+void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
+ struct net_device *dev)
{
nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
@@ -540,7 +593,7 @@ void nf_flow_table_cleanup(struct net_device *dev)
mutex_lock(&flowtable_lock);
list_for_each_entry(flowtable, &flowtables, list)
- nf_flow_table_iterate_cleanup(flowtable, dev);
+ nf_flow_table_gc_cleanup(flowtable, dev);
mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
@@ -550,22 +603,85 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_lock(&flowtable_lock);
list_del(&flow_table->list);
mutex_unlock(&flowtable_lock);
+
cancel_delayed_work_sync(&flow_table->gc_work);
- nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
nf_flow_table_offload_flush(flow_table);
+ /* ... no more pending work after this stage ... */
+ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+ nf_flow_table_gc_run(flow_table);
+ nf_flow_table_offload_flush_cleanup(flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+static int nf_flow_table_init_net(struct net *net)
+{
+ net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
+ return net->ft.stat ? 0 : -ENOMEM;
+}
+
+static void nf_flow_table_fini_net(struct net *net)
+{
+ free_percpu(net->ft.stat);
+}
+
+static int nf_flow_table_pernet_init(struct net *net)
+{
+ int ret;
+
+ ret = nf_flow_table_init_net(net);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_init_proc(net);
+ if (ret < 0)
+ goto out_proc;
+
+ return 0;
+
+out_proc:
+ nf_flow_table_fini_net(net);
+ return ret;
+}
+
+static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_flow_table_fini_proc(net);
+ nf_flow_table_fini_net(net);
+ }
+}
+
+static struct pernet_operations nf_flow_table_net_ops = {
+ .init = nf_flow_table_pernet_init,
+ .exit_batch = nf_flow_table_pernet_exit,
+};
+
static int __init nf_flow_table_module_init(void)
{
- return nf_flow_table_offload_init();
+ int ret;
+
+ ret = register_pernet_subsys(&nf_flow_table_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_offload_init();
+ if (ret)
+ goto out_offload;
+
+ return 0;
+
+out_offload:
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
+ return ret;
}
static void __exit nf_flow_table_module_exit(void)
{
nf_flow_table_offload_exit();
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
}
module_init(nf_flow_table_module_init);
@@ -573,3 +689,4 @@ module_exit(nf_flow_table_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter flow table module");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 88bedf1ff1ae..0ccabf3fa6aa 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -6,12 +6,29 @@
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
+#include <linux/if_vlan.h>
static unsigned int
nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
+ struct vlan_ethhdr *veth;
+ __be16 proto;
+
switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ proto = veth->h_vlan_encapsulated_proto;
+ break;
+ case htons(ETH_P_PPP_SES):
+ proto = nf_flow_pppoe_proto(skb);
+ break;
+ default:
+ proto = skb->protocol;
+ break;
+ }
+
+ switch (proto) {
case htons(ETH_P_IP):
return nf_flow_offload_ip_hook(priv, skb, state);
case htons(ETH_P_IPV6):
@@ -54,8 +71,30 @@ static struct nf_flowtable_type flowtable_inet = {
.owner = THIS_MODULE,
};
+static struct nf_flowtable_type flowtable_ipv4 = {
+ .family = NFPROTO_IPV4,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv4,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ip_hook,
+ .owner = THIS_MODULE,
+};
+
+static struct nf_flowtable_type flowtable_ipv6 = {
+ .family = NFPROTO_IPV6,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv6,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ipv6_hook,
+ .owner = THIS_MODULE,
+};
+
static int __init nf_flow_inet_module_init(void)
{
+ nft_register_flowtable_type(&flowtable_ipv4);
+ nft_register_flowtable_type(&flowtable_ipv6);
nft_register_flowtable_type(&flowtable_inet);
return 0;
@@ -64,6 +103,8 @@ static int __init nf_flow_inet_module_init(void)
static void __exit nf_flow_inet_module_exit(void)
{
nft_unregister_flowtable_type(&flowtable_inet);
+ nft_unregister_flowtable_type(&flowtable_ipv6);
+ nft_unregister_flowtable_type(&flowtable_ipv4);
}
module_init(nf_flow_inet_module_init);
@@ -71,4 +112,7 @@ module_exit(nf_flow_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */
+MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module");
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 9e563fd3da0f..b350fe9d00b0 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -7,11 +7,13 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
+#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack_acct.h>
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -24,9 +26,6 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
if (proto != IPPROTO_TCP)
return 0;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
if (unlikely(tcph->fin || tcph->rst)) {
flow_offload_teardown(flow);
@@ -36,30 +35,20 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return 0;
}
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
}
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
@@ -67,31 +56,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
+static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
{
switch (iph->protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -106,17 +89,15 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
iph->daddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -131,29 +112,24 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
iph->saddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- unsigned int thoff, enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, enum flow_offload_tuple_dir dir,
+ struct iphdr *iph)
{
- struct iphdr *iph = ip_hdr(skb);
-
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir);
+ }
}
static bool ip_has_options(unsigned int thoff)
@@ -161,43 +137,101 @@ static bool ip_has_options(unsigned int thoff)
return thoff != sizeof(struct iphdr);
}
+static void nf_flow_tuple_encap(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ struct vlan_ethhdr *veth;
+ struct pppoe_hdr *phdr;
+ int i = 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ tuple->encap[i].id = skb_vlan_tag_get(skb);
+ tuple->encap[i].proto = skb->vlan_proto;
+ i++;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
+ tuple->encap[i].proto = skb->protocol;
+ break;
+ case htons(ETH_P_PPP_SES):
+ phdr = (struct pppoe_hdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(phdr->sid);
+ tuple->encap[i].proto = skb->protocol;
+ break;
+ }
+}
+
static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple, u32 *hdrsize,
+ u32 offset)
{
struct flow_ports *ports;
unsigned int thoff;
struct iphdr *iph;
+ u8 ipproto;
- if (!pskb_may_pull(skb, sizeof(*iph)))
+ if (!pskb_may_pull(skb, sizeof(*iph) + offset))
return -1;
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ thoff = (iph->ihl * 4);
if (ip_is_fragment(iph) ||
unlikely(ip_has_options(thoff)))
return -1;
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
+ thoff += offset;
+
+ ipproto = iph->protocol;
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ *hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ *hdrsize = sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ *hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
return -1;
+ }
if (iph->ttl <= 1)
return -1;
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
tuple->src_v4.s_addr = iph->saddr;
tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
+ tuple->l4proto = ipproto;
tuple->iifidx = dev->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -214,12 +248,13 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
return true;
}
-static int nf_flow_offload_dst_check(struct dst_entry *dst)
+static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple)
{
- if (unlikely(dst_xfrm(dst)))
- return dst_check(dst, 0) ? 0 : -1;
+ if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH &&
+ tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM)
+ return true;
- return 0;
+ return dst_check(tuple->dst_cache, tuple->dst_cookie);
}
static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
@@ -232,11 +267,73 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table,
- struct flow_offload *flow)
+static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
+ u32 *offset)
{
- return nf_flowtable_hw_offload(flow_table) &&
- test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags);
+ struct vlan_ethhdr *veth;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ if (veth->h_vlan_encapsulated_proto == proto) {
+ *offset += VLAN_HLEN;
+ return true;
+ }
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_proto(skb) == proto) {
+ *offset += PPPOE_SES_HLEN;
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+static void nf_flow_encap_pop(struct sk_buff *skb,
+ struct flow_offload_tuple_rhash *tuplehash)
+{
+ struct vlan_hdr *vlan_hdr;
+ int i;
+
+ for (i = 0; i < tuplehash->tuple.encap_num; i++) {
+ if (skb_vlan_tag_present(skb)) {
+ __vlan_hwaccel_clear_tag(skb);
+ continue;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ vlan_hdr = (struct vlan_hdr *)skb->data;
+ __skb_pull(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vlan_hdr);
+ skb_reset_network_header(skb);
+ break;
+ case htons(ETH_P_PPP_SES):
+ skb->protocol = nf_flow_pppoe_proto(skb);
+ skb_pull(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+ break;
+ }
+ }
+}
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ const struct flow_offload_tuple_rhash *tuplehash,
+ unsigned short type)
+{
+ struct net_device *outdev;
+
+ outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx);
+ if (!outdev)
+ return NF_DROP;
+
+ skb->dev = outdev;
+ dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest,
+ tuplehash->tuple.out.h_source, skb->len);
+ dev_queue_xmit(skb);
+
+ return NF_STOLEN;
}
unsigned int
@@ -249,15 +346,18 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
enum flow_offload_tuple_dir dir;
struct flow_offload *flow;
struct net_device *outdev;
+ u32 hdrsize, offset = 0;
+ unsigned int thoff, mtu;
struct rtable *rt;
- unsigned int thoff;
struct iphdr *iph;
__be32 nexthop;
+ int ret;
- if (skb->protocol != htons(ETH_P_IP))
+ if (skb->protocol != htons(ETH_P_IP) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset))
return NF_ACCEPT;
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -266,78 +366,85 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return NF_ACCEPT;
- if (skb_try_make_writable(skb, sizeof(*iph)))
- return NF_DROP;
-
- thoff = ip_hdr(skb)->ihl * 4;
- if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ thoff = (iph->ihl * 4) + offset;
+ if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
- if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
- nf_flow_offload_add(flow_table, flow);
-
- if (nf_flow_offload_dst_check(&rt->dst)) {
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
}
- if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+ flow_offload_refresh(flow_table, flow);
+
+ nf_flow_encap_pop(skb, tuplehash);
+ thoff -= offset;
+
iph = ip_hdr(skb);
+ nf_flow_nat_ip(flow, skb, thoff, dir, iph);
+
ip_decrease_ttl(iph);
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
IPCB(skb)->flags = IPSKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
+ outdev = rt->dst.dev;
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ ret = NF_STOLEN;
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP);
+ if (ret == NF_DROP)
+ flow_offload_teardown(flow);
+ break;
+ }
- return NF_STOLEN;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr,
+ struct ipv6hdr *ip6h)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
new_addr->s6_addr32, true);
-
- return 0;
}
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
@@ -345,32 +452,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -385,17 +486,15 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
ip6h->daddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -410,64 +509,91 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
ip6h->saddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir,
+ struct ipv6hdr *ip6h)
{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
unsigned int thoff = sizeof(*ip6h);
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
}
static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple, u32 *hdrsize,
+ u32 offset)
{
struct flow_ports *ports;
struct ipv6hdr *ip6h;
unsigned int thoff;
+ u8 nexthdr;
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ thoff = sizeof(*ip6h) + offset;
+ if (!pskb_may_pull(skb, thoff))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
- if (ip6h->nexthdr != IPPROTO_TCP &&
- ip6h->nexthdr != IPPROTO_UDP)
+ nexthdr = ip6h->nexthdr;
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ *hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ *hdrsize = sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ *hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
return -1;
+ }
if (ip6h->hop_limit <= 1)
return -1;
- thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
tuple->src_v6 = ip6h->saddr;
tuple->dst_v6 = ip6h->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET6;
- tuple->l4proto = ip6h->nexthdr;
+ tuple->l4proto = nexthdr;
tuple->iifidx = dev->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -483,13 +609,17 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct in6_addr *nexthop;
struct flow_offload *flow;
struct net_device *outdev;
+ unsigned int thoff, mtu;
+ u32 hdrsize, offset = 0;
struct ipv6hdr *ip6h;
struct rt6_info *rt;
+ int ret;
- if (skb->protocol != htons(ETH_P_IPV6))
+ if (skb->protocol != htons(ETH_P_IPV6) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset))
return NF_ACCEPT;
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -498,47 +628,62 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return NF_ACCEPT;
- if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
- sizeof(*ip6h)))
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ thoff = sizeof(*ip6h) + offset;
+ if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
return NF_ACCEPT;
- if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
- nf_flow_offload_add(flow_table, flow);
-
- if (nf_flow_offload_dst_check(&rt->dst)) {
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
}
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
- if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
- return NF_DROP;
+ flow_offload_refresh(flow_table, flow);
+
+ nf_flow_encap_pop(skb, tuplehash);
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
ip6h = ipv6_hdr(skb);
+ nf_flow_nat_ipv6(flow, skb, dir, ip6h);
+
ip6h->hop_limit--;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
+
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
IP6CB(skb)->flags = IP6SKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+ outdev = rt->dst.dev;
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ ret = NF_STOLEN;
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6);
+ if (ret == NF_DROP)
+ flow_offload_teardown(flow);
+ break;
+ }
- return NF_STOLEN;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 06f00cdc3891..b04645ced89b 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -7,54 +7,94 @@
#include <linux/tc_act/tc_csum.h>
#include <net/flow_offload.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-static struct work_struct nf_flow_offload_work;
-static DEFINE_SPINLOCK(flow_offload_pending_list_lock);
-static LIST_HEAD(flow_offload_pending_list);
+static struct workqueue_struct *nf_flow_offload_add_wq;
+static struct workqueue_struct *nf_flow_offload_del_wq;
+static struct workqueue_struct *nf_flow_offload_stats_wq;
struct flow_offload_work {
struct list_head list;
enum flow_cls_command cmd;
- int priority;
struct nf_flowtable *flowtable;
struct flow_offload *flow;
-};
-
-struct nf_flow_key {
- struct flow_dissector_key_meta meta;
- struct flow_dissector_key_control control;
- struct flow_dissector_key_basic basic;
- union {
- struct flow_dissector_key_ipv4_addrs ipv4;
- struct flow_dissector_key_ipv6_addrs ipv6;
- };
- struct flow_dissector_key_tcp tcp;
- struct flow_dissector_key_ports tp;
-} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
-
-struct nf_flow_match {
- struct flow_dissector dissector;
- struct nf_flow_key key;
- struct nf_flow_key mask;
-};
-
-struct nf_flow_rule {
- struct nf_flow_match match;
- struct flow_rule *rule;
+ struct work_struct work;
};
#define NF_FLOW_DISSECTOR(__match, __type, __field) \
(__match)->dissector.offset[__type] = \
offsetof(struct nf_flow_key, __field)
+/*
+ * Add outer (tunnel) header matches to @match from lightweight-tunnel
+ * metadata.
+ *
+ * Nothing is added unless @tun_info is non-NULL and has IP_TUNNEL_INFO_TX set
+ * in its mode. The tunnel key id is always matched with a full mask. The
+ * tunnel's destination address is stored as the match source and vice versa;
+ * each address is only given an all-ones mask when it is non-zero.
+ */
+static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
+				   struct ip_tunnel_info *tun_info)
+{
+	struct nf_flow_key *mask = &match->mask;
+	struct nf_flow_key *key = &match->key;
+	unsigned int enc_keys;
+
+	if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))
+		return;
+
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control);
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+	key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id);
+	mask->enc_key_id.keyid = 0xffffffff;
+	enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+		   BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL);
+
+	if (ip_tunnel_info_af(tun_info) == AF_INET) {
+		NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+				  enc_ipv4);
+		key->enc_ipv4.src = tun_info->key.u.ipv4.dst;
+		key->enc_ipv4.dst = tun_info->key.u.ipv4.src;
+		if (key->enc_ipv4.src)
+			mask->enc_ipv4.src = 0xffffffff;
+		if (key->enc_ipv4.dst)
+			mask->enc_ipv4.dst = 0xffffffff;
+		enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
+		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	} else {
+		/*
+		 * Record where enc_ipv6 lives in the key, as the IPv4 branch
+		 * does for enc_ipv4; the used_keys bit set below is of no use
+		 * to a consumer without the matching dissector offset.
+		 */
+		NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
+				  enc_ipv6);
+		memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst,
+		       sizeof(struct in6_addr));
+		memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src,
+		       sizeof(struct in6_addr));
+		if (memcmp(&key->enc_ipv6.src, &in6addr_any,
+			   sizeof(struct in6_addr)))
+			memset(&mask->enc_ipv6.src, 0xff,
+			       sizeof(struct in6_addr));
+		if (memcmp(&key->enc_ipv6.dst, &in6addr_any,
+			   sizeof(struct in6_addr)))
+			memset(&mask->enc_ipv6.dst, 0xff,
+			       sizeof(struct in6_addr));
+		enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
+		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	}
+
+	match->dissector.used_keys |= enc_keys;
+}
+
+/*
+ * Fill one VLAN dissector key/mask pair: match @vlan_id under VLAN_VID_MASK
+ * and @proto (the TPID) under a full 16-bit mask.
+ */
+static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key,
+				    struct flow_dissector_key_vlan *mask,
+				    u16 vlan_id, __be16 proto)
+{
+	/* Values to compare against. */
+	key->vlan_tpid = proto;
+	key->vlan_id = vlan_id;
+
+	/* Bits of those values that take part in the comparison. */
+	mask->vlan_tpid = 0xffff;
+	mask->vlan_id = VLAN_VID_MASK;
+}
+
static int nf_flow_rule_match(struct nf_flow_match *match,
- const struct flow_offload_tuple *tuple)
+ const struct flow_offload_tuple *tuple,
+ struct dst_entry *other_dst)
{
struct nf_flow_key *mask = &match->mask;
struct nf_flow_key *key = &match->key;
+ struct ip_tunnel_info *tun_info;
+ bool vlan_encap = false;
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
@@ -64,9 +104,44 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
- key->meta.ingress_ifindex = tuple->iifidx;
+ if (other_dst && other_dst->lwtstate) {
+ tun_info = lwt_tun_info(other_dst->lwtstate);
+ nf_flow_rule_lwt_match(match, tun_info);
+ }
+
+ if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC)
+ key->meta.ingress_ifindex = tuple->tc.iifidx;
+ else
+ key->meta.ingress_ifindex = tuple->iifidx;
+
mask->meta.ingress_ifindex = 0xffffffff;
+ if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) &&
+ tuple->encap[0].proto == htons(ETH_P_8021Q)) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan);
+ nf_flow_rule_vlan_match(&key->vlan, &mask->vlan,
+ tuple->encap[0].id,
+ tuple->encap[0].proto);
+ vlan_encap = true;
+ }
+
+ if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) &&
+ tuple->encap[1].proto == htons(ETH_P_8021Q)) {
+ if (vlan_encap) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN,
+ cvlan);
+ nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan,
+ tuple->encap[1].id,
+ tuple->encap[1].proto);
+ } else {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN,
+ vlan);
+ nf_flow_rule_vlan_match(&key->vlan, &mask->vlan,
+ tuple->encap[1].id,
+ tuple->encap[1].proto);
+ }
+ }
+
switch (tuple->l3proto) {
case AF_INET:
key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -87,6 +162,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
default:
return -EOPNOTSUPP;
}
+ mask->control.addr_type = 0xffff;
match->dissector.used_keys |= BIT(key->control.addr_type);
mask->basic.n_proto = 0xffff;
@@ -97,6 +173,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP);
break;
case IPPROTO_UDP:
+ case IPPROTO_GRE:
break;
default:
return -EOPNOTSUPP;
@@ -105,15 +182,22 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
key->basic.ip_proto = tuple->l4proto;
mask->basic.ip_proto = 0xff;
- key->tp.src = tuple->src_port;
- mask->tp.src = 0xffff;
- key->tp.dst = tuple->dst_port;
- mask->tp.dst = 0xffff;
-
match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) |
BIT(FLOW_DISSECTOR_KEY_CONTROL) |
- BIT(FLOW_DISSECTOR_KEY_BASIC) |
- BIT(FLOW_DISSECTOR_KEY_PORTS);
+ BIT(FLOW_DISSECTOR_KEY_BASIC);
+
+ switch (tuple->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ key->tp.src = tuple->src_port;
+ mask->tp.src = 0xffff;
+ key->tp.dst = tuple->dst_port;
+ mask->tp.dst = 0xffff;
+
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS);
+ break;
+ }
+
return 0;
}
@@ -141,27 +225,43 @@ static int flow_offload_eth_src(struct net *net,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- struct net_device *dev;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
+ struct net_device *dev = NULL;
+ const unsigned char *addr;
u32 mask, val;
u16 val16;
- dev = dev_get_by_index(net, tuple->iifidx);
- if (!dev)
- return -ENOENT;
+ this_tuple = &flow->tuplehash[dir].tuple;
+
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ addr = this_tuple->out.h_source;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ dev = dev_get_by_index(net, other_tuple->iifidx);
+ if (!dev)
+ return -ENOENT;
+
+ addr = dev->dev_addr;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
mask = ~0xffff0000;
- memcpy(&val16, dev->dev_addr, 2);
+ memcpy(&val16, addr, 2);
val = val16 << 16;
flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
mask = ~0xffffffff;
- memcpy(&val, dev->dev_addr + 2, 4);
+ memcpy(&val, addr + 2, 4);
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
&val, &mask);
+
dev_put(dev);
return 0;
@@ -174,27 +274,40 @@ static int flow_offload_eth_dst(struct net *net,
{
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- const void *daddr = &flow->tuplehash[!dir].tuple.src_v4;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
const struct dst_entry *dst_cache;
unsigned char ha[ETH_ALEN];
struct neighbour *n;
+ const void *daddr;
u32 mask, val;
u8 nud_state;
u16 val16;
- dst_cache = flow->tuplehash[dir].tuple.dst_cache;
- n = dst_neigh_lookup(dst_cache, daddr);
- if (!n)
- return -ENOENT;
+ this_tuple = &flow->tuplehash[dir].tuple;
- read_lock_bh(&n->lock);
- nud_state = n->nud_state;
- ether_addr_copy(ha, n->ha);
- read_unlock_bh(&n->lock);
-
- if (!(nud_state & NUD_VALID)) {
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ether_addr_copy(ha, this_tuple->out.h_dest);
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ daddr = &other_tuple->src_v4;
+ dst_cache = this_tuple->dst_cache;
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -ENOENT;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
neigh_release(n);
- return -ENOENT;
+
+ if (!(nud_state & NUD_VALID))
+ return -ENOENT;
+ break;
+ default:
+ return -EOPNOTSUPP;
}
mask = ~0xffffffff;
@@ -207,7 +320,6 @@ static int flow_offload_eth_dst(struct net *net,
val = val16;
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
- neigh_release(n);
return 0;
}
@@ -271,12 +383,12 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
const __be32 *addr, const __be32 *mask)
{
struct flow_action_entry *entry;
- int i;
+ int i, j;
- for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) {
+ for (i = 0, j = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32), j++) {
entry = flow_action_entry_next(flow_rule);
flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
- offset + i, &addr[i], mask);
+ offset + i, &addr[j], mask);
}
}
@@ -429,25 +541,149 @@ static void flow_offload_ipv4_checksum(struct net *net,
}
}
-static void flow_offload_redirect(const struct flow_offload *flow,
+static void flow_offload_redirect(struct net *net,
+ const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
- struct rtable *rt;
+ const struct flow_offload_tuple *this_tuple, *other_tuple;
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+ int ifindex;
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ this_tuple = &flow->tuplehash[dir].tuple;
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ this_tuple = &flow->tuplehash[dir].tuple;
+ ifindex = this_tuple->out.hw_ifidx;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ifindex = other_tuple->iifidx;
+ break;
+ default:
+ return;
+ }
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return;
+
+ entry = flow_action_entry_next(flow_rule);
entry->id = FLOW_ACTION_REDIRECT;
- entry->dev = rt->dst.dev;
- dev_hold(rt->dst.dev);
+ entry->dev = dev;
+}
+
+/*
+ * Append a FLOW_ACTION_TUNNEL_ENCAP action when the cached route of this
+ * direction's tuple carries lightweight-tunnel TX metadata. Tuples using
+ * FLOW_OFFLOAD_XMIT_DIRECT are left untouched.
+ */
+static void flow_offload_encap_tunnel(const struct flow_offload *flow,
+				      enum flow_offload_tuple_dir dir,
+				      struct nf_flow_rule *flow_rule)
+{
+	const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple;
+	struct ip_tunnel_info *info;
+	struct flow_action_entry *act;
+	struct dst_entry *route;
+
+	if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+		return;
+
+	route = tuple->dst_cache;
+	if (!route || !route->lwtstate)
+		return;
+
+	info = lwt_tun_info(route->lwtstate);
+	if (!info || !(info->mode & IP_TUNNEL_INFO_TX))
+		return;
+
+	act = flow_action_entry_next(flow_rule);
+	act->id = FLOW_ACTION_TUNNEL_ENCAP;
+	act->tunnel = info;
+}
+
+/*
+ * Append a FLOW_ACTION_TUNNEL_DECAP action when the cached route of the
+ * opposite direction's tuple carries lightweight-tunnel TX metadata. Nothing
+ * is added when that tuple uses FLOW_OFFLOAD_XMIT_DIRECT.
+ */
+static void flow_offload_decap_tunnel(const struct flow_offload *flow,
+				      enum flow_offload_tuple_dir dir,
+				      struct nf_flow_rule *flow_rule)
+{
+	const struct flow_offload_tuple *reverse = &flow->tuplehash[!dir].tuple;
+	struct ip_tunnel_info *info;
+	struct flow_action_entry *act;
+	struct dst_entry *route;
+
+	if (reverse->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+		return;
+
+	route = reverse->dst_cache;
+	if (!route || !route->lwtstate)
+		return;
+
+	info = lwt_tun_info(route->lwtstate);
+	if (!info || !(info->mode & IP_TUNNEL_INFO_TX))
+		return;
+
+	act = flow_action_entry_next(flow_rule);
+	act->id = FLOW_ACTION_TUNNEL_DECAP;
+}
+
+/*
+ * Emit the actions shared by the IPv4 and IPv6 route rules: tunnel
+ * decap/encap, Ethernet source/destination rewrite, a VLAN pop for each
+ * 802.1Q tag recorded in this direction's tuple, and a VLAN or PPPoE push for
+ * each encapsulation recorded in the opposite direction's tuple.
+ *
+ * Encapsulation levels flagged in in_vlan_ingress are skipped in both loops.
+ *
+ * Returns 0 on success, -1 if either Ethernet rewrite helper fails.
+ */
+static int
+nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
+			  enum flow_offload_tuple_dir dir,
+			  struct nf_flow_rule *flow_rule)
+{
+	const struct flow_offload_tuple *other_tuple;
+	const struct flow_offload_tuple *tuple;
+	int i;
+
+	flow_offload_decap_tunnel(flow, dir, flow_rule);
+	flow_offload_encap_tunnel(flow, dir, flow_rule);
+
+	if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
+	    flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+		return -1;
+
+	tuple = &flow->tuplehash[dir].tuple;
+
+	for (i = 0; i < tuple->encap_num; i++) {
+		struct flow_action_entry *entry;
+
+		if (tuple->in_vlan_ingress & BIT(i))
+			continue;
+
+		if (tuple->encap[i].proto == htons(ETH_P_8021Q)) {
+			entry = flow_action_entry_next(flow_rule);
+			entry->id = FLOW_ACTION_VLAN_POP;
+		}
+	}
+
+	other_tuple = &flow->tuplehash[!dir].tuple;
+
+	for (i = 0; i < other_tuple->encap_num; i++) {
+		struct flow_action_entry *entry;
+
+		if (other_tuple->in_vlan_ingress & BIT(i))
+			continue;
+
+		/*
+		 * Take an action entry only for encapsulations we can express,
+		 * mirroring the pop loop above; otherwise an unhandled
+		 * protocol would consume an entry that is never filled in.
+		 */
+		switch (other_tuple->encap[i].proto) {
+		case htons(ETH_P_PPP_SES):
+			entry = flow_action_entry_next(flow_rule);
+			entry->id = FLOW_ACTION_PPPOE_PUSH;
+			entry->pppoe.sid = other_tuple->encap[i].id;
+			break;
+		case htons(ETH_P_8021Q):
+			entry = flow_action_entry_next(flow_rule);
+			entry->id = FLOW_ACTION_VLAN_PUSH;
+			entry->vlan.vid = other_tuple->encap[i].id;
+			entry->vlan.proto = other_tuple->encap[i].proto;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return 0;
+}
int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
- flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
@@ -462,7 +698,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
test_bit(NF_FLOW_DNAT, &flow->flags))
flow_offload_ipv4_checksum(net, flow, flow_rule);
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -472,8 +708,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
- flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
@@ -485,7 +720,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
flow_offload_port_dnat(net, flow, dir, flow_rule);
}
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -499,8 +734,9 @@ nf_flow_offload_rule_alloc(struct net *net,
enum flow_offload_tuple_dir dir)
{
const struct nf_flowtable *flowtable = offload->flowtable;
+ const struct flow_offload_tuple *tuple, *other_tuple;
const struct flow_offload *flow = offload->flow;
- const struct flow_offload_tuple *tuple;
+ struct dst_entry *other_dst = NULL;
struct nf_flow_rule *flow_rule;
int err = -ENOMEM;
@@ -517,7 +753,11 @@ nf_flow_offload_rule_alloc(struct net *net,
flow_rule->rule->match.key = &flow_rule->match.key;
tuple = &flow->tuplehash[dir].tuple;
- err = nf_flow_rule_match(&flow_rule->match, tuple);
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ other_dst = other_tuple->dst_cache;
+
+ err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
if (err < 0)
goto err_flow_match;
@@ -597,6 +837,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
struct nf_flow_rule *flow_rule,
enum flow_offload_tuple_dir dir,
int priority, int cmd,
+ struct flow_stats *stats,
struct list_head *block_cb_list)
{
struct flow_cls_offload cls_flow = {};
@@ -610,6 +851,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
if (cmd == FLOW_CLS_REPLACE)
cls_flow.rule = flow_rule->rule;
+ down_read(&flowtable->flow_block_lock);
list_for_each_entry(block_cb, block_cb_list, list) {
err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow,
block_cb->cb_priv);
@@ -618,6 +860,10 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
i++;
}
+ up_read(&flowtable->flow_block_lock);
+
+ if (cmd == FLOW_CLS_STATS)
+ memcpy(stats, &cls_flow.stats, sizeof(*stats));
return i;
}
@@ -627,8 +873,9 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
return nf_flow_offload_tuple(offload->flowtable, offload->flow,
- flow_rule, dir, offload->priority,
- FLOW_CLS_REPLACE,
+ flow_rule, dir,
+ offload->flowtable->priority,
+ FLOW_CLS_REPLACE, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -636,7 +883,8 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_DESTROY,
+ offload->flowtable->priority,
+ FLOW_CLS_DESTROY, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -666,13 +914,17 @@ static void flow_offload_work_add(struct flow_offload_work *offload)
err = flow_offload_rule_add(offload, flow_rule);
if (err < 0)
- set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags);
+ goto out;
+
+ set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
+out:
nf_flow_offload_destroy(flow_rule);
}
static void flow_offload_work_del(struct flow_offload_work *offload)
{
+ clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
@@ -682,19 +934,10 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir,
struct flow_stats *stats)
{
- struct nf_flowtable *flowtable = offload->flowtable;
- struct flow_cls_offload cls_flow = {};
- struct flow_block_cb *block_cb;
- struct netlink_ext_ack extack;
- __be16 proto = ETH_P_ALL;
-
- nf_flow_offload_init(&cls_flow, proto, offload->priority,
- FLOW_CLS_STATS,
- &offload->flow->tuplehash[dir].tuple, &extack);
-
- list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
- block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv);
- memcpy(stats, &cls_flow.stats, sizeof(*stats));
+ nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+ offload->flowtable->priority,
+ FLOW_CLS_STATS, stats,
+ &offload->flowtable->flow_block.cb_list);
}
static void flow_offload_work_stats(struct flow_offload_work *offload)
@@ -707,44 +950,62 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
offload->flow->timeout = max_t(u64, offload->flow->timeout,
- lastused + NF_FLOW_TIMEOUT);
+ lastused + flow_offload_get_timeout(offload->flow));
+
+ if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
+ if (stats[0].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_ORIGINAL,
+ stats[0].pkts, stats[0].bytes);
+ if (stats[1].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_REPLY,
+ stats[1].pkts, stats[1].bytes);
+ }
}
static void flow_offload_work_handler(struct work_struct *work)
{
- struct flow_offload_work *offload, *next;
- LIST_HEAD(offload_pending_list);
-
- spin_lock_bh(&flow_offload_pending_list_lock);
- list_replace_init(&flow_offload_pending_list, &offload_pending_list);
- spin_unlock_bh(&flow_offload_pending_list_lock);
+ struct flow_offload_work *offload;
+ struct net *net;
- list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
- switch (offload->cmd) {
+ offload = container_of(work, struct flow_offload_work, work);
+ net = read_pnet(&offload->flowtable->net);
+ switch (offload->cmd) {
case FLOW_CLS_REPLACE:
flow_offload_work_add(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add);
break;
case FLOW_CLS_DESTROY:
flow_offload_work_del(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del);
break;
case FLOW_CLS_STATS:
flow_offload_work_stats(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats);
break;
default:
WARN_ON_ONCE(1);
- }
- list_del(&offload->list);
- kfree(offload);
}
+
+ clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags);
+ kfree(offload);
}
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- spin_lock_bh(&flow_offload_pending_list_lock);
- list_add_tail(&offload->list, &flow_offload_pending_list);
- spin_unlock_bh(&flow_offload_pending_list_lock);
+ struct net *net = read_pnet(&offload->flowtable->net);
- schedule_work(&nf_flow_offload_work);
+ if (offload->cmd == FLOW_CLS_REPLACE) {
+ NF_FLOW_TABLE_STAT_INC(net, count_wq_add);
+ queue_work(nf_flow_offload_add_wq, &offload->work);
+ } else if (offload->cmd == FLOW_CLS_DESTROY) {
+ NF_FLOW_TABLE_STAT_INC(net, count_wq_del);
+ queue_work(nf_flow_offload_del_wq, &offload->work);
+ } else {
+ NF_FLOW_TABLE_STAT_INC(net, count_wq_stats);
+ queue_work(nf_flow_offload_stats_wq, &offload->work);
+ }
}
static struct flow_offload_work *
@@ -753,14 +1014,19 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
{
struct flow_offload_work *offload;
+ if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags))
+ return NULL;
+
offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
- if (!offload)
+ if (!offload) {
+ clear_bit(NF_FLOW_HW_PENDING, &flow->flags);
return NULL;
+ }
offload->cmd = cmd;
offload->flow = flow;
- offload->priority = flowtable->priority;
offload->flowtable = flowtable;
+ INIT_WORK(&offload->work, flow_offload_work_handler);
return offload;
}
@@ -798,7 +1064,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
__s32 delta;
delta = nf_flow_timeout_delta(flow->timeout);
- if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10))
+ if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10))
return;
offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS);
@@ -808,10 +1074,21 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
flow_offload_queue_work(offload);
}
+/*
+ * For hardware-offloaded flowtables only: wait for the pending work on the
+ * delete workqueue, then run one garbage-collection pass over @flowtable.
+ */
+void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable)
+{
+	if (!nf_flowtable_hw_offload(flowtable))
+		return;
+
+	flush_workqueue(nf_flow_offload_del_wq);
+	nf_flow_table_gc_run(flowtable);
+}
+
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
- if (nf_flowtable_hw_offload(flowtable))
- flush_work(&nf_flow_offload_work);
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_add_wq);
+ flush_workqueue(nf_flow_offload_del_wq);
+ flush_workqueue(nf_flow_offload_stats_wq);
+ }
}
static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -839,25 +1116,58 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
return err;
}
-static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
- struct nf_flowtable *flowtable,
- struct net_device *dev,
- enum flow_block_command cmd,
- struct netlink_ext_ack *extack)
+static void nf_flow_table_block_offload_init(struct flow_block_offload *bo,
+ struct net *net,
+ enum flow_block_command cmd,
+ struct nf_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
- int err;
-
- if (!dev->netdev_ops->ndo_setup_tc)
- return -EOPNOTSUPP;
-
memset(bo, 0, sizeof(*bo));
- bo->net = dev_net(dev);
+ bo->net = net;
bo->block = &flowtable->flow_block;
bo->command = cmd;
bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
bo->extack = extack;
+ bo->cb_list_head = &flowtable->flow_block.cb_list;
INIT_LIST_HEAD(&bo->cb_list);
+}
+/*
+ * Cleanup callback handed to flow_indr_dev_setup_offload(): run the
+ * flowtable's per-device gc cleanup, then unlink and free @block_cb while
+ * holding flow_block_lock for writing.
+ */
+static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb)
+{
+	struct nf_flowtable *ft = block_cb->indr.data;
+	struct net_device *netdev = block_cb->indr.dev;
+
+	nf_flow_table_gc_cleanup(ft, netdev);
+
+	down_write(&ft->flow_block_lock);
+	list_del(&block_cb->list);
+	list_del(&block_cb->driver_list);
+	flow_block_cb_free(block_cb);
+	up_write(&ft->flow_block_lock);
+}
+
+/*
+ * Initialise @bo for @cmd on @flowtable and pass it to the indirect block
+ * infrastructure, registering nf_flow_table_indr_cleanup as the cleanup
+ * callback. Returns whatever flow_indr_dev_setup_offload() returns.
+ */
+static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo,
+					  struct nf_flowtable *flowtable,
+					  struct net_device *dev,
+					  enum flow_block_command cmd,
+					  struct netlink_ext_ack *extack)
+{
+	struct net *netns = dev_net(dev);
+
+	nf_flow_table_block_offload_init(bo, netns, cmd, flowtable, extack);
+
+	return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable,
+					   bo, nf_flow_table_indr_cleanup);
+}
+
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
if (err < 0)
return err;
@@ -876,7 +1186,12 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
if (!nf_flowtable_hw_offload(flowtable))
return 0;
- err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
+ if (dev->netdev_ops->ndo_setup_tc)
+ err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
+ else
+ err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
if (err < 0)
return err;
@@ -886,20 +1201,33 @@ EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
int nf_flow_table_offload_init(void)
{
- INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler);
+ nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_add_wq)
+ return -ENOMEM;
+
+ nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_del_wq)
+ goto err_del_wq;
+
+ nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_stats_wq)
+ goto err_stats_wq;
return 0;
+
+err_stats_wq:
+ destroy_workqueue(nf_flow_offload_del_wq);
+err_del_wq:
+ destroy_workqueue(nf_flow_offload_add_wq);
+ return -ENOMEM;
}
void nf_flow_table_offload_exit(void)
{
- struct flow_offload_work *offload, *next;
- LIST_HEAD(offload_pending_list);
-
- cancel_work_sync(&nf_flow_offload_work);
-
- list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
- list_del(&offload->list);
- kfree(offload);
- }
+ destroy_workqueue(nf_flow_offload_add_wq);
+ destroy_workqueue(nf_flow_offload_del_wq);
+ destroy_workqueue(nf_flow_offload_stats_wq);
}
diff --git a/net/netfilter/nf_flow_table_procfs.c b/net/netfilter/nf_flow_table_procfs.c
new file mode 100644
index 000000000000..159b033a43e6
--- /dev/null
+++ b/net/netfilter/nf_flow_table_procfs.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <net/netfilter/nf_flow_table.h>
+
+/*
+ * seq_file ->start: position 0 yields the header token; position N > 0
+ * yields the per-CPU stat block of the first possible CPU at index >= N - 1,
+ * and *pos is advanced to that CPU's index plus one. Returns NULL once no
+ * possible CPU remains.
+ */
+static void *nf_flow_table_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	cpu = *pos - 1;
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu + 1;
+			return per_cpu_ptr(net->ft.stat, cpu);
+		}
+		++cpu;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file ->next: return the per-CPU stat block of the first possible CPU
+ * at index >= *pos and advance *pos past it. When none is left, *pos is still
+ * incremented and NULL is returned.
+ */
+static void *nf_flow_table_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu = *pos;
+
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu + 1;
+			return per_cpu_ptr(net->ft.stat, cpu);
+		}
+		++cpu;
+	}
+
+	(*pos)++;
+	return NULL;
+}
+
+/* seq_file ->stop: intentionally empty, start/next acquire nothing here. */
+static void nf_flow_table_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * seq_file ->show: print the column header for the start token, otherwise
+ * one line with the three workqueue counters of the stat block in @v.
+ * Always returns 0.
+ */
+static int nf_flow_table_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN) {
+		const struct nf_flow_table_stat *stat = v;
+
+		seq_printf(seq, "%8d %8d %8d\n",
+			   stat->count_wq_add,
+			   stat->count_wq_del,
+			   stat->count_wq_stats);
+	} else {
+		seq_puts(seq, "wq_add wq_del wq_stats\n");
+	}
+
+	return 0;
+}
+
+/* Iterator callbacks backing the per-netns "nf_flowtable" proc entry. */
+static const struct seq_operations nf_flow_table_cpu_seq_ops = {
+	.start = nf_flow_table_cpu_seq_start,
+	.next = nf_flow_table_cpu_seq_next,
+	.stop = nf_flow_table_cpu_seq_stop,
+	.show = nf_flow_table_cpu_seq_show,
+};
+
+/*
+ * Create the read-only (0444) "nf_flowtable" entry under @net's
+ * proc_net_stat directory. Returns 0 on success, -ENOMEM if the entry could
+ * not be created.
+ */
+int nf_flow_table_init_proc(struct net *net)
+{
+	if (!proc_create_net("nf_flowtable", 0444, net->proc_net_stat,
+			     &nf_flow_table_cpu_seq_ops,
+			     sizeof(struct seq_net_private)))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Remove the "nf_flowtable" entry from @net's proc_net_stat directory. */
+void nf_flow_table_fini_proc(struct net *net)
+{
+	remove_proc_entry("nf_flowtable", net->proc_net_stat);
+}
diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c
new file mode 100644
index 000000000000..00e89ffd78f6
--- /dev/null
+++ b/net/netfilter/nf_hooks_lwtunnel.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sysctl.h>
+#include <net/lwtunnel.h>
+#include <net/netfilter/nf_hooks_lwtunnel.h>
+
+/* Report the nf_hooks_lwtunnel_enabled static key as 1 (on) or 0 (off). */
+static inline int nf_hooks_lwtunnel_get(void)
+{
+	return static_branch_unlikely(&nf_hooks_lwtunnel_enabled) ? 1 : 0;
+}
+
+/*
+ * Switch the nf_hooks_lwtunnel_enabled static key on when @enable is
+ * non-zero.  The key is never switched off here: asking to disable it while
+ * it is on returns -EBUSY; every other combination returns 0.
+ */
+static inline int nf_hooks_lwtunnel_set(int enable)
+{
+	if (!static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) {
+		if (enable)
+			static_branch_enable(&nf_hooks_lwtunnel_enabled);
+		return 0;
+	}
+
+	return enable ? 0 : -EBUSY;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * sysctl handler exposing the static key as an integer limited to 0..1.
+ *
+ * The value is staged in a stack variable behind a temporary ctl_table so
+ * proc_dointvec_minmax() can parse and range-check it.  On read the variable
+ * is preloaded from nf_hooks_lwtunnel_get(); on a successful write it is
+ * applied through nf_hooks_lwtunnel_set(), whose result is returned.
+ */
+int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write,
+				     void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int enabled = write ? 0 : nf_hooks_lwtunnel_get();
+	struct ctl_table tmp = {
+		.procname	= table->procname,
+		.data		= &enabled,
+		.maxlen		= sizeof(int),
+		.mode		= table->mode,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	};
+	int ret;
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	return nf_hooks_lwtunnel_set(enabled);
+}
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler);
+#endif /* CONFIG_SYSCTL */
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index d6c43902ebd7..832ae64179f0 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -6,6 +6,23 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+/* nf_conntrack_netlink.c: applied on tuple filters.
+ * One bit per tuple attribute; each flag is named CTA_FILTER_F_ followed by
+ * the attribute name so that CTA_FILTER_FLAG() can build it by token pasting.
+ */
+#define CTA_FILTER_F_CTA_IP_SRC (1 << 0)
+#define CTA_FILTER_F_CTA_IP_DST (1 << 1)
+#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2)
+#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3)
+#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4)
+#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5)
+#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6)
+#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7)
+#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8)
+#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9)
+#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10)
+#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11)
+#define CTA_FILTER_F_MAX (1 << 12) /* first bit above the last flag defined here */
+#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1) /* mask with bits 0..11 set */
+#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr /* e.g. CTA_FILTER_FLAG(CTA_IP_SRC) */
+
/* nf_queue.c */
void nf_queue_nf_hook_drop(struct net *net);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index bb25d4c794c7..8a29290149bd 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -151,13 +151,6 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf)
}
EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_logger_request_module(int pf, enum nf_log_type type)
-{
- if (loggers[pf][type] == NULL)
- request_module("nf-logger-%u-%u", pf, type);
-}
-EXPORT_SYMBOL_GPL(nf_logger_request_module);
-
int nf_logger_find_get(int pf, enum nf_log_type type)
{
struct nf_logger *logger;
@@ -177,9 +170,6 @@ int nf_logger_find_get(int pf, enum nf_log_type type)
return 0;
}
- if (rcu_access_pointer(loggers[pf][type]) == NULL)
- request_module("nf-logger-%u-%u", pf, type);
-
rcu_read_lock();
logger = rcu_dereference(loggers[pf][type]);
if (logger == NULL)
@@ -414,7 +404,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = {
};
static int nf_log_proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
const struct nf_logger *logger;
char buf[NFLOGGER_NAME_LEN];
@@ -453,9 +443,9 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
mutex_lock(&nf_log_mutex);
logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
if (!logger)
- strlcpy(buf, "NONE", sizeof(buf));
+ strscpy(buf, "NONE", sizeof(buf));
else
- strlcpy(buf, logger->name, sizeof(buf));
+ strscpy(buf, logger->name, sizeof(buf));
mutex_unlock(&nf_log_mutex);
r = proc_dostring(&tmp, write, buffer, lenp, ppos);
}
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
deleted file mode 100644
index ae5628ddbe6d..000000000000
--- a/net/netfilter/nf_log_common.c
+++ /dev/null
@@ -1,212 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
- u8 proto, int fragment, unsigned int offset)
-{
- struct udphdr _udph;
- const struct udphdr *uh;
-
- if (proto == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- nf_log_buf_add(m, "PROTO=UDP ");
- else /* Max length: 14 "PROTO=UDPLITE " */
- nf_log_buf_add(m, "PROTO=UDPLITE ");
-
- if (fragment)
- goto out;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
- if (uh == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
-
- return 1;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
-
-out:
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_udp_header);
-
-int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
- u8 proto, int fragment, unsigned int offset,
- unsigned int logflags)
-{
- struct tcphdr _tcph;
- const struct tcphdr *th;
-
- /* Max length: 10 "PROTO=TCP " */
- nf_log_buf_add(m, "PROTO=TCP ");
-
- if (fragment)
- return 0;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
- if (th == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
- return 1;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- nf_log_buf_add(m, "SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & NF_LOG_TCPSEQ) {
- nf_log_buf_add(m, "SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- }
-
- /* Max length: 13 "WINDOW=65535 " */
- nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3C " */
- nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
- TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- nf_log_buf_add(m, "CWR ");
- if (th->ece)
- nf_log_buf_add(m, "ECE ");
- if (th->urg)
- nf_log_buf_add(m, "URG ");
- if (th->ack)
- nf_log_buf_add(m, "ACK ");
- if (th->psh)
- nf_log_buf_add(m, "PSH ");
- if (th->rst)
- nf_log_buf_add(m, "RST ");
- if (th->syn)
- nf_log_buf_add(m, "SYN ");
- if (th->fin)
- nf_log_buf_add(m, "FIN ");
- /* Max length: 11 "URGP=65535 " */
- nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
- u_int8_t _opt[60 - sizeof(struct tcphdr)];
- const u_int8_t *op;
- unsigned int i;
- unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
-
- op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
- optsize, _opt);
- if (op == NULL) {
- nf_log_buf_add(m, "OPT (TRUNCATED)");
- return 1;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- nf_log_buf_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- nf_log_buf_add(m, "%02X", op[i]);
-
- nf_log_buf_add(m, ") ");
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
-
-void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
- struct sock *sk)
-{
- if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
- return;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket && sk->sk_socket->file) {
- const struct cred *cred = sk->sk_socket->file->f_cred;
- nf_log_buf_add(m, "UID=%u GID=%u ",
- from_kuid_munged(&init_user_ns, cred->fsuid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
- }
- read_unlock_bh(&sk->sk_callback_lock);
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid);
-
-void
-nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo, const char *prefix)
-{
- const struct net_device *physoutdev __maybe_unused;
- const struct net_device *physindev __maybe_unused;
-
- nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
- '0' + loginfo->u.log.level, prefix,
- in ? in->name : "",
- out ? out->name : "");
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- physindev = nf_bridge_get_physindev(skb);
- if (physindev && in != physindev)
- nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
- physoutdev = nf_bridge_get_physoutdev(skb);
- if (physoutdev && out != physoutdev)
- nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
-#endif
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
-
-/* bridge and netdev logging families share this code. */
-void nf_log_l2packet(struct net *net, u_int8_t pf,
- __be16 protocol,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- switch (protocol) {
- case htons(ETH_P_IP):
- nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_IPV6):
- nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_ARP):
- case htons(ETH_P_RARP):
- nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- }
-}
-EXPORT_SYMBOL_GPL(nf_log_l2packet);
-
-static int __init nf_log_common_init(void)
-{
- return 0;
-}
-
-static void __exit nf_log_common_exit(void) {}
-
-module_init(nf_log_common_init);
-module_exit(nf_log_common_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
deleted file mode 100644
index 968dafa684c9..000000000000
--- a/net/netfilter/nf_log_netdev.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org>
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_log.h>
-
-static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
- loginfo, prefix);
-}
-
-static struct nf_logger nf_netdev_logger __read_mostly = {
- .name = "nf_log_netdev",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_netdev_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_netdev_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
-}
-
-static void __net_exit nf_log_netdev_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_netdev_logger);
-}
-
-static struct pernet_operations nf_log_netdev_net_ops = {
- .init = nf_log_netdev_net_init,
- .exit = nf_log_netdev_net_exit,
-};
-
-static int __init nf_log_netdev_init(void)
-{
- int ret;
-
- /* Request to load the real packet loggers. */
- nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
-
- ret = register_pernet_subsys(&nf_log_netdev_net_ops);
- if (ret < 0)
- return ret;
-
- nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
- return 0;
-}
-
-static void __exit nf_log_netdev_exit(void)
-{
- unregister_pernet_subsys(&nf_log_netdev_net_ops);
- nf_log_unregister(&nf_netdev_logger);
-}
-
-module_init(nf_log_netdev_init);
-module_exit(nf_log_netdev_exit);
-
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter netdev packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
new file mode 100644
index 000000000000..cb894f0d63e9
--- /dev/null
+++ b/net/netfilter/nf_log_syslog.c
@@ -0,0 +1,1082 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+/* Substituted by the loggers in this file when the caller passes a NULL loginfo. */
+static const struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_DEFAULT_MASK,
+ },
+ },
+};
+/* ARP body for Ethernet hardware / 4-byte protocol addresses; dump_arp_packet() reads it right after struct arphdr. */
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+/* Guard against containers flooding syslog. */
+static bool nf_log_allowed(const struct net *net)
+{
+	/* init_net may always log; any other netns only when the knob is non-zero. */
+	if (net_eq(net, &init_net))
+		return true;
+
+	return sysctl_nf_log_all_netns;
+}
+
+/* Append "VPROTO=xxxx VID=n " to @m when @skb carries a VLAN tag; otherwise add nothing. */
+static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb)
+{
+	if (skb_vlan_tag_present(skb)) {
+		u16 vid = skb_vlan_tag_get(skb);
+
+		nf_log_buf_add(m, "VPROTO=%04x VID=%u ",
+			       ntohs(skb->vlan_proto), vid);
+	}
+}
+
+/*
+ * Append a decoded ARP packet to @m.
+ *
+ * @m:     log buffer being assembled
+ * @info:  log settings; logflags are taken from it only for NF_LOG_TYPE_LOG,
+ *         otherwise NF_LOG_DEFAULT_MASK is used
+ * @skb:   packet to decode
+ * @nhoff: offset of the ARP header inside @skb
+ *
+ * Logs "TRUNCATED" if the ARP header cannot be read.  The address payload
+ * is only decoded for Ethernet hardware addresses with 4-byte protocol
+ * addresses; if it cannot be read, " INCOMPLETE [n bytes]" reports how many
+ * bytes follow the ARP header.
+ */
+static void noinline_for_stack
+dump_arp_packet(struct nf_log_buf *m,
+		const struct nf_loginfo *info,
+		const struct sk_buff *skb, unsigned int nhoff)
+{
+	const struct arppayload *ap;
+	struct arppayload _arpp;
+	const struct arphdr *ah;
+	unsigned int logflags;
+	struct arphdr _arph;
+
+	ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph);
+	if (!ah) {
+		nf_log_buf_add(m, "TRUNCATED");
+		return;
+	}
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_DEFAULT_MASK;
+
+	if (logflags & NF_LOG_MACDECODE) {
+		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+		nf_log_dump_vlan(m, skb);
+		nf_log_buf_add(m, "MACPROTO=%04x ",
+			       ntohs(eth_hdr(skb)->h_proto));
+	}
+
+	nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+		       ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+	/* If it's for Ethernet and the lengths are OK, then log the ARP
+	 * payload.
+	 */
+	if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
+	    ah->ar_hln != ETH_ALEN ||
+	    ah->ar_pln != sizeof(__be32))
+		return;
+
+	ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp);
+	if (!ap) {
+		/* The payload starts at nhoff + sizeof(_arph), so the bytes
+		 * before @nhoff must not be counted as available.  The ARP
+		 * header was read successfully above, so this cannot wrap.
+		 */
+		nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
+			       skb->len - nhoff - sizeof(_arph));
+		return;
+	}
+	nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+		       ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+/*
+ * Start a log line in @m: KERN_SOH plus the level digit from @loginfo, the
+ * caller's @prefix, then the IN=/OUT= device names (empty when the device
+ * pointer is NULL).  When CONFIG_BRIDGE_NETFILTER is enabled, PHYSIN=/
+ * PHYSOUT= are added for bridge ports that differ from @in/@out.
+ */
+static void
+nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
+			  unsigned int hooknum, const struct sk_buff *skb,
+			  const struct net_device *in,
+			  const struct net_device *out,
+			  const struct nf_loginfo *loginfo, const char *prefix)
+{
+	const struct net_device *physoutdev __maybe_unused;
+	const struct net_device *physindev __maybe_unused;
+	const char *in_name = in ? in->name : "";
+	const char *out_name = out ? out->name : "";
+
+	nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+		       '0' + loginfo->u.log.level, prefix,
+		       in_name, out_name);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	physindev = nf_bridge_get_physindev(skb);
+	if (physindev && physindev != in)
+		nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
+	physoutdev = nf_bridge_get_physoutdev(skb);
+	if (physoutdev && physoutdev != out)
+		nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
+#endif
+}
+
+/*
+ * Log callback for ARP: if logging is allowed for @net, build one log line
+ * from the common IN/OUT prefix followed by the decoded ARP packet located
+ * at the skb's network offset.  A NULL @loginfo falls back to default_loginfo.
+ */
+static void nf_log_arp_packet(struct net *net, u_int8_t pf,
+			      unsigned int hooknum, const struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      const struct nf_loginfo *loginfo,
+			      const char *prefix)
+{
+	const struct nf_loginfo *li = loginfo ? loginfo : &default_loginfo;
+	struct nf_log_buf *m;
+
+	if (!nf_log_allowed(net))
+		return;
+
+	m = nf_log_buf_open();
+	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, li, prefix);
+	dump_arp_packet(m, li, skb, skb_network_offset(skb));
+	nf_log_buf_close(m);
+}
+/* Logger descriptor named "nf_log_arp"; its log callback is nf_log_arp_packet(). */
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+/*
+ * Append "UID=n GID=n " for the file that owns @sk's socket.  Nothing is
+ * logged when @sk is NULL, is not a full socket, belongs to a different
+ * netns than @net, or has no socket/file attached.  The file's credentials
+ * are read under sk_callback_lock.
+ */
+static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
+				   struct sock *sk)
+{
+	const struct cred *cred;
+
+	if (!sk)
+		return;
+	if (!sk_fullsock(sk))
+		return;
+	if (!net_eq(net, sock_net(sk)))
+		return;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	if (!sk->sk_socket || !sk->sk_socket->file)
+		goto unlock;
+
+	cred = sk->sk_socket->file->f_cred;
+	nf_log_buf_add(m, "UID=%u GID=%u ",
+		       from_kuid_munged(&init_user_ns, cred->fsuid),
+		       from_kgid_munged(&init_user_ns, cred->fsgid));
+unlock:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Append "PROTO=TCP " and, unless @fragment is non-zero, the TCP header
+ * found at @offset in @skb: ports, SEQ/ACK (with NF_LOG_TCPSEQ), window,
+ * reserved bits, flag names, urgent pointer and the raw option bytes (with
+ * NF_LOG_TCPOPT).
+ *
+ * Returns 1 when the header or its options could not be read from @skb,
+ * 0 otherwise.
+ */
+static noinline_for_stack int
+nf_log_dump_tcp_header(struct nf_log_buf *m,
+		       const struct sk_buff *skb,
+		       u8 proto, int fragment,
+		       unsigned int offset,
+		       unsigned int logflags)
+{
+	u8 _opt[60 - sizeof(struct tcphdr)];
+	const struct tcphdr *th;
+	unsigned int optsize;
+	struct tcphdr _tcph;
+	unsigned int i;
+	const u8 *op;
+
+	/* Max length: 10 "PROTO=TCP " */
+	nf_log_buf_add(m, "PROTO=TCP ");
+
+	if (fragment)
+		return 0;
+
+	/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+	th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+	if (!th) {
+		nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+		return 1;
+	}
+
+	/* Max length: 20 "SPT=65535 DPT=65535 " */
+	nf_log_buf_add(m, "SPT=%u DPT=%u ",
+		       ntohs(th->source), ntohs(th->dest));
+
+	/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+	if (logflags & NF_LOG_TCPSEQ)
+		nf_log_buf_add(m, "SEQ=%u ACK=%u ",
+			       ntohl(th->seq), ntohl(th->ack_seq));
+
+	/* Max length: 13 "WINDOW=65535 " */
+	nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
+
+	/* Max length: 9 "RES=0x3C " */
+	nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
+					    TCP_RESERVED_BITS) >> 22));
+
+	/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+	if (th->cwr)
+		nf_log_buf_add(m, "CWR ");
+	if (th->ece)
+		nf_log_buf_add(m, "ECE ");
+	if (th->urg)
+		nf_log_buf_add(m, "URG ");
+	if (th->ack)
+		nf_log_buf_add(m, "ACK ");
+	if (th->psh)
+		nf_log_buf_add(m, "PSH ");
+	if (th->rst)
+		nf_log_buf_add(m, "RST ");
+	if (th->syn)
+		nf_log_buf_add(m, "SYN ");
+	if (th->fin)
+		nf_log_buf_add(m, "FIN ");
+
+	/* Max length: 11 "URGP=65535 " */
+	nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+	/* Options are dumped only on request and only if the header has any. */
+	if (!(logflags & NF_LOG_TCPOPT) ||
+	    !(th->doff * 4 > sizeof(struct tcphdr)))
+		return 0;
+
+	optsize = th->doff * 4 - sizeof(struct tcphdr);
+	op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
+				optsize, _opt);
+	if (!op) {
+		nf_log_buf_add(m, "OPT (TRUNCATED)");
+		return 1;
+	}
+
+	/* Max length: 127 "OPT (" 15*4*2chars ") " */
+	nf_log_buf_add(m, "OPT (");
+	for (i = 0; i < optsize; i++)
+		nf_log_buf_add(m, "%02X", op[i]);
+	nf_log_buf_add(m, ") ");
+
+	return 0;
+}
+
+/*
+ * Append "PROTO=UDP " (or "PROTO=UDPLITE " for any other @proto) and, unless
+ * @fragment is non-zero, the ports and length of the UDP header found at
+ * @offset in @skb.
+ *
+ * Returns 1 when the header could not be read from @skb, 0 otherwise.
+ */
+static noinline_for_stack int
+nf_log_dump_udp_header(struct nf_log_buf *m,
+		       const struct sk_buff *skb,
+		       u8 proto, int fragment,
+		       unsigned int offset)
+{
+	const struct udphdr *uh;
+	struct udphdr _udph;
+
+	if (proto == IPPROTO_UDP) {
+		/* Max length: 10 "PROTO=UDP " */
+		nf_log_buf_add(m, "PROTO=UDP ");
+	} else {
+		/* Max length: 14 "PROTO=UDPLITE " */
+		nf_log_buf_add(m, "PROTO=UDPLITE ");
+	}
+
+	if (fragment)
+		return 0;
+
+	uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+	if (uh) {
+		/* Max length: 20 "SPT=65535 DPT=65535 " */
+		nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
+			       ntohs(uh->source), ntohs(uh->dest),
+			       ntohs(uh->len));
+		return 0;
+	}
+
+	/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+	nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+	return 1;
+}
+
+/*
+ * Append a decoded IPv4 header found at @iphoff in @skb to @m, followed by
+ * the transport header it carries (TCP, UDP/UDPLITE, ICMP, AH, ESP; any
+ * other protocol is logged by number only).
+ *
+ * @net:    netns used when resolving the socket owner for UID/GID
+ * @m:      log buffer being assembled
+ * @info:   log settings; logflags are taken from it only for
+ *          NF_LOG_TYPE_LOG, otherwise NF_LOG_DEFAULT_MASK is used
+ * @skb:    packet to decode
+ * @iphoff: offset of the IPv4 header inside @skb
+ *
+ * ICMP errors embed the offending packet; it is dumped by a nested call that
+ * is only made when @iphoff is 0.  One level of recursion won't kill us.
+ * UID/GID and MARK are likewise only appended when @iphoff is 0.
+ */
+static noinline_for_stack void
+dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
+		 const struct nf_loginfo *info,
+		 const struct sk_buff *skb, unsigned int iphoff)
+{
+	const struct iphdr *ih;
+	unsigned int logflags;
+	struct iphdr _iph;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_DEFAULT_MASK;
+
+	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+	if (!ih) {
+		nf_log_buf_add(m, "TRUNCATED");
+		return;
+	}
+
+	/* Important fields:
+	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options.
+	 * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 "
+	 */
+	nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+	nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+		       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+		       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+	/* Max length: 6 "CE DF MF " */
+	if (ntohs(ih->frag_off) & IP_CE)
+		nf_log_buf_add(m, "CE ");
+	if (ntohs(ih->frag_off) & IP_DF)
+		nf_log_buf_add(m, "DF ");
+	if (ntohs(ih->frag_off) & IP_MF)
+		nf_log_buf_add(m, "MF ");
+
+	/* Max length: 11 "FRAG:65535 " */
+	if (ntohs(ih->frag_off) & IP_OFFSET)
+		nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+	if ((logflags & NF_LOG_IPOPT) &&
+	    ih->ihl * 4 > sizeof(struct iphdr)) {
+		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+		const unsigned char *op;
+		unsigned int i, optsize;
+
+		optsize = ih->ihl * 4 - sizeof(struct iphdr);
+		op = skb_header_pointer(skb, iphoff + sizeof(_iph),
+					optsize, _opt);
+		if (!op) {
+			nf_log_buf_add(m, "TRUNCATED");
+			return;
+		}
+
+		/* Max length: 127 "OPT (" 15*4*2chars ") " */
+		nf_log_buf_add(m, "OPT (");
+		for (i = 0; i < optsize; i++)
+			nf_log_buf_add(m, "%02X", op[i]);
+		nf_log_buf_add(m, ") ");
+	}
+
+	switch (ih->protocol) {
+	case IPPROTO_TCP:
+		if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+					   ntohs(ih->frag_off) & IP_OFFSET,
+					   iphoff + ih->ihl * 4, logflags))
+			return;
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		if (nf_log_dump_udp_header(m, skb, ih->protocol,
+					   ntohs(ih->frag_off) & IP_OFFSET,
+					   iphoff + ih->ihl * 4))
+			return;
+		break;
+	case IPPROTO_ICMP: {
+		/* Minimum ICMP length per type; 0 means "no check". */
+		static const size_t required_len[NR_ICMP_TYPES + 1] = {
+			[ICMP_ECHOREPLY] = 4,
+			[ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr),
+			[ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr),
+			[ICMP_REDIRECT] = 8 + sizeof(struct iphdr),
+			[ICMP_ECHO] = 4,
+			[ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr),
+			[ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr),
+			[ICMP_TIMESTAMP] = 20,
+			[ICMP_TIMESTAMPREPLY] = 20,
+			[ICMP_ADDRESS] = 12,
+			[ICMP_ADDRESSREPLY] = 12 };
+		const struct icmphdr *ich;
+		struct icmphdr _icmph;
+
+		/* Max length: 11 "PROTO=ICMP " */
+		nf_log_buf_add(m, "PROTO=ICMP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					 sizeof(_icmph), &_icmph);
+		if (!ich) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl * 4);
+			break;
+		}
+
+		/* Max length: 18 "TYPE=255 CODE=255 " */
+		nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		if (ich->type <= NR_ICMP_TYPES &&
+		    required_len[ich->type] &&
+		    skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl * 4);
+			break;
+		}
+
+		switch (ich->type) {
+		case ICMP_ECHOREPLY:
+		case ICMP_ECHO:
+			/* Max length: 19 "ID=65535 SEQ=65535 " */
+			nf_log_buf_add(m, "ID=%u SEQ=%u ",
+				       ntohs(ich->un.echo.id),
+				       ntohs(ich->un.echo.sequence));
+			break;
+
+		case ICMP_PARAMETERPROB:
+			/* Max length: 14 "PARAMETER=255 " */
+			nf_log_buf_add(m, "PARAMETER=%u ",
+				       ntohl(ich->un.gateway) >> 24);
+			break;
+		case ICMP_REDIRECT:
+			/* Max length: 24 "GATEWAY=255.255.255.255 " */
+			nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+			fallthrough;
+		case ICMP_DEST_UNREACH:
+		case ICMP_SOURCE_QUENCH:
+		case ICMP_TIME_EXCEEDED:
+			/* Max length: 3+maxlen */
+			if (!iphoff) { /* Only recurse once. */
+				nf_log_buf_add(m, "[");
+				dump_ipv4_packet(net, m, info, skb,
+						 iphoff + ih->ihl * 4 + sizeof(_icmph));
+				nf_log_buf_add(m, "] ");
+			}
+
+			/* Max length: 10 "MTU=65535 " */
+			if (ich->type == ICMP_DEST_UNREACH &&
+			    ich->code == ICMP_FRAG_NEEDED) {
+				nf_log_buf_add(m, "MTU=%u ",
+					       ntohs(ich->un.frag.mtu));
+			}
+		}
+		break;
+	}
+	/* Max Length */
+	case IPPROTO_AH: {
+		const struct ip_auth_hdr *ah;
+		struct ip_auth_hdr _ahdr;
+
+		/* Max length: 9 "PROTO=AH "
+		 * Emitted before the fragment test, as every other case
+		 * does, so non-first fragments still log their protocol.
+		 */
+		nf_log_buf_add(m, "PROTO=AH ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ah = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					sizeof(_ahdr), &_ahdr);
+		if (!ah) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl * 4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+		break;
+	}
+	case IPPROTO_ESP: {
+		const struct ip_esp_hdr *eh;
+		struct ip_esp_hdr _esph;
+
+		/* Max length: 10 "PROTO=ESP " */
+		nf_log_buf_add(m, "PROTO=ESP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		eh = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					sizeof(_esph), &_esph);
+		if (!eh) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl * 4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+		break;
+	}
+	/* Max length: 10 "PROTO 255 " */
+	default:
+		nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+	}
+
+	/* Max length: 15 "UID=4294967295 " */
+	if ((logflags & NF_LOG_UID) && !iphoff)
+		nf_log_dump_sk_uid_gid(net, m, skb->sk);
+
+	/* Max length: 16 "MARK=0xFFFFFFFF " */
+	if (!iphoff && skb->mark)
+		nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+	/* Proto    Max log string length */
+	/* IP:	    40+46+6+11+127 = 230 */
+	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */
+	/* UDP:     10+max(25,20) = 35 */
+	/* UDPLITE: 14+max(25,20) = 39 */
+	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+	/* ESP:     10+max(25)+15 = 50 */
+	/* AH:	    9+max(25)+15 = 49 */
+	/* unknown: 10 */
+
+	/* (ICMP allows recursion one level deep) */
+	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */
+	/* maxlen = 230+   91  + 230 + 252 = 803 */
+}
+/*
+ * Append a decoded IPv6 header found at @ip6hoff in @skb to @m, then walk the
+ * extension header chain and decode the upper-layer header that follows
+ * (TCP, UDP/UDPLITE, ICMPv6; anything else is logged by number only).
+ *
+ * logflags come from @info only for NF_LOG_TYPE_LOG, otherwise
+ * NF_LOG_DEFAULT_MASK is used.  A non-zero @recurse lets an ICMPv6 error
+ * dump the packet embedded in it (the nested call passes 0) and enables the
+ * trailing UID/GID and MARK fields.
+ */
+static noinline_for_stack void
+dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int ip6hoff,
+ int recurse)
+{
+ const struct ipv6hdr *ih;
+ unsigned int hdrlen = 0;
+ unsigned int logflags;
+ struct ipv6hdr _ip6h;
+ unsigned int ptr;
+ u8 currenthdr;
+ int fragment;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+ nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+ (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+ ih->hop_limit,
+ (ntohl(*(__be32 *)ih) & 0x000fffff));
+ /* Walk the extension headers; ptr is the skb offset of the header being examined. */
+ fragment = 0;
+ ptr = ip6hoff + sizeof(struct ipv6hdr);
+ currenthdr = ih->nexthdr;
+ while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) {
+ struct ipv6_opt_hdr _hdr;
+ const struct ipv6_opt_hdr *hp;
+
+ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+ if (!hp) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 48 "OPT (...) " */
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, "OPT ( ");
+
+ switch (currenthdr) {
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr _fhdr;
+ const struct frag_hdr *fh;
+
+ nf_log_buf_add(m, "FRAG:");
+ fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+ &_fhdr);
+ if (!fh) {
+ nf_log_buf_add(m, "TRUNCATED ");
+ return;
+ }
+
+ /* Max length: 6 "65535 " */
+ nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+ /* Max length: 11 "INCOMPLETE " */
+ if (fh->frag_off & htons(0x0001))
+ nf_log_buf_add(m, "INCOMPLETE ");
+
+ nf_log_buf_add(m, "ID:%08x ",
+ ntohl(fh->identification));
+
+ if (ntohs(fh->frag_off) & 0xFFF8)
+ fragment = 1;
+
+ hdrlen = 8; /* ptr advances 8 bytes past this header */
+ break;
+ }
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_HOPOPTS:
+ if (fragment) {
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, ")");
+ return;
+ }
+ hdrlen = ipv6_optlen(hp);
+ break;
+ /* Max Length */
+ case IPPROTO_AH:
+ if (logflags & NF_LOG_IPOPT) {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ /* Max length: 3 "AH " */
+ nf_log_buf_add(m, "AH ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+ &_ahdr);
+ if (!ah) {
+ /* Max length: 26 "INCOMPLETE [65535 bytes] )" */
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ }
+
+ hdrlen = ipv6_authlen(hp);
+ break;
+ case IPPROTO_ESP:
+ if (logflags & NF_LOG_IPOPT) {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 4 "ESP " */
+ nf_log_buf_add(m, "ESP ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ /* Max length: 26 "INCOMPLETE [65535 bytes] )" */
+ eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+ &_esph);
+ if (!eh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 16 "SPI=0xF1234567 )" */
+ nf_log_buf_add(m, "SPI=0x%x )",
+ ntohl(eh->spi));
+ }
+ return; /* ESP ends the walk: nothing after it is decoded */
+ default:
+ /* Max length: 20 "Unknown Ext Hdr 255" */
+ nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
+ return; /* NOTE(review): with NF_LOG_IPOPT the "OPT ( " opened above is left unclosed */
+ }
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, ") ");
+
+ currenthdr = hp->nexthdr;
+ ptr += hdrlen;
+ }
+
+ switch (currenthdr) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
+ ptr, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
+ return;
+ break;
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _icmp6h;
+ const struct icmp6hdr *ic;
+
+ /* Max length: 13 "PROTO=ICMPv6 " */
+ nf_log_buf_add(m, "PROTO=ICMPv6 ");
+
+ if (fragment)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+ if (!ic) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ",
+ ic->icmp6_type, ic->icmp6_code);
+
+ switch (ic->icmp6_type) {
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ic->icmp6_identifier),
+ ntohs(ic->icmp6_sequence));
+ break;
+ case ICMPV6_MGM_QUERY:
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MGM_REDUCTION:
+ break;
+
+ case ICMPV6_PARAMPROB:
+ /* Max length: 17 "POINTER=ffffffff " */
+ nf_log_buf_add(m, "POINTER=%08x ",
+ ntohl(ic->icmp6_pointer));
+ fallthrough;
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_PKT_TOOBIG:
+ case ICMPV6_TIME_EXCEED:
+ /* Max length: 3+maxlen; the nested call passes recurse=0 so it cannot nest again */
+ if (recurse) {
+ nf_log_buf_add(m, "[");
+ dump_ipv6_packet(net, m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohl(ic->icmp6_mtu));
+ }
+ }
+ break;
+ }
+ /* Max length: 10 "PROTO=255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", currenthdr);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & NF_LOG_UID) && recurse)
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (recurse && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+}
+
+/* Append a textual rendering of the link-layer header of @skb to @m.
+ *
+ * When NF_LOG_MACDECODE is set (flags are only read from NF_LOG_TYPE_LOG
+ * infos) and the device type is ARPHRD_ETHER, the decoded form
+ * "MACSRC=... MACDST=... <nf_log_dump_vlan output> MACPROTO=..." is emitted.
+ * In every other case the output is "MAC=", optionally followed by
+ * dev->hard_header_len raw bytes in colon-separated hex and, on ARPHRD_SIT
+ * devices, a " TUNNEL=src->dst" suffix, and always terminated by a space.
+ *
+ * skb->dev is dereferenced unconditionally, so @skb must have a device
+ * attached.
+ */
+static void dump_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ /* info->u.log is only consulted for NF_LOG_TYPE_LOG; otherwise no flags */
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & NF_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ /* No decoder for this device type: use the raw hex dump below */
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ /* Only dump bytes if the device has a link-layer header and the MAC
+ * header offset differs from the network header offset.
+ */
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ if (dev->type == ARPHRD_SIT) {
+ /* For SIT the dump starts ETH_HLEN bytes before the
+ * recorded MAC header; skip the hex dump entirely if
+ * that would point before the start of the buffer.
+ */
+ p -= ETH_HLEN;
+
+ if (p < skb->head)
+ p = NULL;
+ }
+
+ if (p) {
+ /* First byte without a separator, the rest as ":xx" */
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++)
+ nf_log_buf_add(m, ":%02x", *p++);
+ }
+
+ if (dev->type == ARPHRD_SIT) {
+ /* The bytes at the MAC header offset are read as an IPv4
+ * header (presumably the outer tunnel header -- confirm)
+ * to report the tunnel endpoints.
+ */
+ const struct iphdr *iph =
+ (struct iphdr *)skb_mac_header(skb);
+
+ nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr,
+ &iph->daddr);
+ }
+ }
+ nf_log_buf_add(m, " ");
+}
+
+/* Syslog backend for IPv4 packets.
+ *
+ * Emits the common prefix, the MAC header (only when an input device was
+ * supplied) and the IPv4 dump starting at the skb's network offset. Nothing
+ * is emitted when nf_log_allowed() refuses logging for @net. A NULL @loginfo
+ * selects default_loginfo.
+ */
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+			     unsigned int hooknum, const struct sk_buff *skb,
+			     const struct net_device *in,
+			     const struct net_device *out,
+			     const struct nf_loginfo *loginfo,
+			     const char *prefix)
+{
+	const struct nf_loginfo *li = loginfo ? loginfo : &default_loginfo;
+	struct nf_log_buf *buf;
+
+	if (!nf_log_allowed(net))
+		return;
+
+	buf = nf_log_buf_open();
+
+	nf_log_dump_packet_common(buf, pf, hooknum, skb, in, out, li, prefix);
+	if (in)
+		dump_mac_header(buf, li, skb);
+	dump_ipv4_packet(net, buf, li, skb, skb_network_offset(skb));
+
+	nf_log_buf_close(buf);
+}
+
+/* Logger descriptor for IPv4: log requests are handled by nf_log_ip_packet().
+ * Registered for NFPROTO_IPV4 in nf_log_syslog_init().
+ */
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+/* Syslog backend for IPv6 packets.
+ *
+ * Emits the common prefix, the MAC header (only when an input device was
+ * supplied) and the IPv6 dump starting at the skb's network offset, with
+ * recursion into embedded packets enabled (last argument 1). Nothing is
+ * emitted when nf_log_allowed() refuses logging for @net. A NULL @loginfo
+ * selects default_loginfo.
+ */
+static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+			      unsigned int hooknum, const struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      const struct nf_loginfo *loginfo,
+			      const char *prefix)
+{
+	const struct nf_loginfo *li = loginfo ? loginfo : &default_loginfo;
+	struct nf_log_buf *buf;
+
+	if (!nf_log_allowed(net))
+		return;
+
+	buf = nf_log_buf_open();
+
+	nf_log_dump_packet_common(buf, pf, hooknum, skb, in, out, li, prefix);
+	if (in)
+		dump_mac_header(buf, li, skb);
+	dump_ipv6_packet(net, buf, li, skb, skb_network_offset(skb), 1);
+
+	nf_log_buf_close(buf);
+}
+
+/* Logger descriptor for IPv6: log requests are handled by nf_log_ip6_packet().
+ * Registered for NFPROTO_IPV6 in nf_log_syslog_init().
+ */
+static struct nf_logger nf_ip6_logger __read_mostly = {
+ .name = "nf_log_ipv6",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip6_packet,
+ .me = THIS_MODULE,
+};
+
+/* Fallback syslog backend for packets with no protocol-specific dumper.
+ *
+ * Emits only the common prefix and the MAC header. Unlike the IPv4/IPv6
+ * variants the MAC header is dumped regardless of whether @in is set.
+ * Nothing is emitted when nf_log_allowed() refuses logging for @net. A NULL
+ * @loginfo selects default_loginfo.
+ */
+static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
+				  unsigned int hooknum,
+				  const struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_loginfo *loginfo,
+				  const char *prefix)
+{
+	const struct nf_loginfo *li = loginfo ? loginfo : &default_loginfo;
+	struct nf_log_buf *buf;
+
+	if (!nf_log_allowed(net))
+		return;
+
+	buf = nf_log_buf_open();
+
+	nf_log_dump_packet_common(buf, pf, hooknum, skb, in, out, li, prefix);
+	dump_mac_header(buf, li, skb);
+
+	nf_log_buf_close(buf);
+}
+
+/* Dispatch a netdev/bridge log request on the skb's L3 protocol.
+ *
+ * IPv4, IPv6 and ARP/RARP frames go to their dedicated loggers; everything
+ * else is handled by nf_log_unknown_packet(). All arguments are forwarded
+ * unchanged.
+ */
+static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
+				 unsigned int hooknum,
+				 const struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 const struct nf_loginfo *loginfo,
+				 const char *prefix)
+{
+	const __be16 proto = skb->protocol;
+
+	if (proto == htons(ETH_P_IP))
+		nf_log_ip_packet(net, pf, hooknum, skb, in, out,
+				 loginfo, prefix);
+	else if (proto == htons(ETH_P_IPV6))
+		nf_log_ip6_packet(net, pf, hooknum, skb, in, out,
+				  loginfo, prefix);
+	else if (proto == htons(ETH_P_ARP) || proto == htons(ETH_P_RARP))
+		nf_log_arp_packet(net, pf, hooknum, skb, in, out,
+				  loginfo, prefix);
+	else
+		nf_log_unknown_packet(net, pf, hooknum, skb, in, out,
+				      loginfo, prefix);
+}
+
+/* Logger descriptor for the netdev family; dispatches on skb->protocol via
+ * nf_log_netdev_packet(). Registered for NFPROTO_NETDEV.
+ */
+static struct nf_logger nf_netdev_logger __read_mostly = {
+ .name = "nf_log_netdev",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+/* Logger descriptor for the bridge family; shares nf_log_netdev_packet()
+ * with the netdev logger. Registered for NFPROTO_BRIDGE.
+ */
+static struct nf_logger nf_bridge_logger __read_mostly = {
+ .name = "nf_log_bridge",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+/* Per-netns init: bind the syslog loggers for IPv4, ARP, IPv6, netdev and
+ * bridge (in that order) in @net.
+ *
+ * On failure every logger bound so far is unbound again in reverse order
+ * and the error from nf_log_set() is returned. Returns 0 on success.
+ */
+static int __net_init nf_log_syslog_net_init(struct net *net)
+{
+	int err;
+
+	err = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+	if (err)
+		return err;
+
+	err = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+	if (err)
+		goto unset_ip;
+
+	err = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
+	if (err)
+		goto unset_arp;
+
+	err = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
+	if (err)
+		goto unset_ip6;
+
+	err = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
+	if (err)
+		goto unset_netdev;
+
+	return 0;
+
+unset_netdev:
+	nf_log_unset(net, &nf_netdev_logger);
+unset_ip6:
+	nf_log_unset(net, &nf_ip6_logger);
+unset_arp:
+	nf_log_unset(net, &nf_arp_logger);
+unset_ip:
+	nf_log_unset(net, &nf_ip_logger);
+	return err;
+}
+
+/* Per-netns exit: unbind all five syslog loggers from @net, in the same
+ * order they are listed below (IPv4, ARP, IPv6, netdev, bridge).
+ */
+static void __net_exit nf_log_syslog_net_exit(struct net *net)
+{
+	struct nf_logger *loggers[] = {
+		&nf_ip_logger,
+		&nf_arp_logger,
+		&nf_ip6_logger,
+		&nf_netdev_logger,
+		&nf_bridge_logger,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(loggers); i++)
+		nf_log_unset(net, loggers[i]);
+}
+
+/* Per-network-namespace hooks: bind the loggers on namespace init and unbind
+ * them on namespace exit. Registered from nf_log_syslog_init().
+ */
+static struct pernet_operations nf_log_syslog_net_ops = {
+ .init = nf_log_syslog_net_init,
+ .exit = nf_log_syslog_net_exit,
+};
+
+/* Module init: register the pernet operations, then the IPv4, ARP, IPv6,
+ * netdev and bridge loggers (in that order).
+ *
+ * If a logger fails to register, the loggers registered so far are
+ * unregistered in reverse order, an error is printed, the pernet operations
+ * are unregistered and the failing call's error code is returned. A failure
+ * of register_pernet_subsys() itself is returned without a message.
+ */
+static int __init nf_log_syslog_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&nf_log_syslog_net_ops);
+	if (err < 0)
+		return err;
+
+	err = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+	if (err < 0)
+		goto fail_pernet;
+
+	err = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+	if (err < 0)
+		goto fail_ip;
+
+	err = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+	if (err < 0)
+		goto fail_arp;
+
+	err = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
+	if (err < 0)
+		goto fail_ip6;
+
+	err = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger);
+	if (err < 0)
+		goto fail_netdev;
+
+	return 0;
+
+fail_netdev:
+	nf_log_unregister(&nf_netdev_logger);
+fail_ip6:
+	nf_log_unregister(&nf_ip6_logger);
+fail_arp:
+	nf_log_unregister(&nf_arp_logger);
+fail_ip:
+	nf_log_unregister(&nf_ip_logger);
+fail_pernet:
+	pr_err("failed to register logger\n");
+	unregister_pernet_subsys(&nf_log_syslog_net_ops);
+	return err;
+}
+
+/* Module exit: drop the pernet operations first, then unregister the five
+ * loggers in the order IPv4, ARP, IPv6, netdev, bridge.
+ */
+static void __exit nf_log_syslog_exit(void)
+{
+	struct nf_logger *loggers[] = {
+		&nf_ip_logger,
+		&nf_arp_logger,
+		&nf_ip6_logger,
+		&nf_netdev_logger,
+		&nf_bridge_logger,
+	};
+	unsigned int i;
+
+	unregister_pernet_subsys(&nf_log_syslog_net_ops);
+
+	for (i = 0; i < ARRAY_SIZE(loggers); i++)
+		nf_log_unregister(loggers[i]);
+}
+
+/* Module entry/exit points */
+module_init(nf_log_syslog_init);
+module_exit(nf_log_syslog_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter syslog packet logging");
+MODULE_LICENSE("GPL");
+/* Name-based aliases, one per logger registered by nf_log_syslog_init() */
+MODULE_ALIAS("nf_log_arp");
+MODULE_ALIAS("nf_log_bridge");
+MODULE_ALIAS("nf_log_ipv4");
+MODULE_ALIAS("nf_log_ipv6");
+MODULE_ALIAS("nf_log_netdev");
+/* Per-family logger aliases; the second argument is 0 for all of them */
+MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0);
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
+MODULE_ALIAS_NF_LOGGER(3, 0); /* presumably NFPROTO_ARP -- confirm */
+MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 3bc7e0854efe..98deef6cde69 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -44,19 +44,7 @@ static unsigned int help(struct sk_buff *skb,
exp->expectfn = nf_nat_follow_master;
/* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int res;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- res = nf_ct_expect_related(exp, 0);
- if (res == 0)
- break;
- else if (res != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
new file mode 100644
index 000000000000..0fa5a0bbb0ff
--- /dev/null
+++ b/net/netfilter/nf_nat_bpf.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable NAT Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_nat BTF");
+
+/* bpf_ct_set_nat_info - Configure source or destination NAT on a new nf_conn
+ *
+ * Sets the NAT address (and optionally the port) of a newly allocated
+ * nf_conn before it is inserted. This must be invoked for referenced
+ * PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct	- Pointer to referenced nf_conn object, obtained using
+ *		  bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @addr	- NAT source/destination address; used as both the minimum
+ *		  and maximum of the address range.
+ * @port	- NAT source/destination port. Zero or a negative value
+ *		  leaves the port unspecified so that one is selected
+ *		  automatically.
+ *		  NOTE(review): values above 65535 are not rejected and are
+ *		  narrowed by cpu_to_be16() -- confirm this is intended.
+ * @manip	- NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
+ *
+ * Returns 0 on success, -EINVAL if the conntrack's L3 protocol is neither
+ * IPv4 nor IPv6, or -ENOMEM if nf_nat_setup_info() returns NF_DROP.
+ */
+int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+			union nf_inet_addr *addr, int port,
+			enum nf_nat_manip_type manip)
+{
+	struct nf_conn *ct = (struct nf_conn *)nfct;
+	struct nf_nat_range2 range;
+	u16 l3num;
+
+	l3num = nf_ct_l3num(ct);
+	switch (l3num) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Single-address range: min == max == *addr */
+	memset(&range, 0, sizeof(range));
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = *addr;
+	range.max_addr = *addr;
+
+	if (port > 0) {
+		const __be16 be_port = cpu_to_be16(port);
+
+		/* Single-port range pinned to the requested port */
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto.all = be_port;
+		range.max_proto.all = be_port;
+	}
+
+	if (nf_nat_setup_info(ct, &range, manip) == NF_DROP)
+		return -ENOMEM;
+
+	return 0;
+}
+
+__diag_pop()
+
+/* BTF id set listing the NAT kfuncs; bpf_ct_set_nat_info is flagged
+ * KF_TRUSTED_ARGS.
+ */
+BTF_SET8_START(nf_nat_kfunc_set)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
+BTF_SET8_END(nf_nat_kfunc_set)
+
+/* kfunc id set descriptor owned by this module; registered for the XDP and
+ * SCHED_CLS program types by register_nf_nat_bpf().
+ */
+static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_nat_kfunc_set,
+};
+
+/* Register the NAT kfunc set for XDP programs and then for SCHED_CLS
+ * programs.
+ *
+ * Returns 0 on success, or the first non-zero result from
+ * register_btf_kfunc_id_set(); the SCHED_CLS registration is not attempted
+ * if the XDP one fails.
+ */
+int register_nf_nat_bpf(void)
+{
+	int err = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+					    &nf_bpf_nat_kfunc_set);
+
+	if (!err)
+		err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+						&nf_bpf_nat_kfunc_set);
+
+	return err;
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index bfc555fcbc72..e29e4ccb5c5a 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -13,10 +13,10 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/rtnetlink.h>
-#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
-static unsigned int nf_nat_hash_rnd __read_mostly;
+static siphash_aligned_key_t nf_nat_hash_rnd;
struct nf_nat_lookup_hook_priv {
struct nf_hook_entries __rcu *entries;
@@ -146,56 +146,36 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
return;
}
}
-
-int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
-{
- struct flowi fl;
- unsigned int hh_len;
- struct dst_entry *dst;
- struct sock *sk = skb->sk;
- int err;
-
- err = xfrm_decode_session(skb, &fl, family);
- if (err < 0)
- return err;
-
- dst = skb_dst(skb);
- if (dst->xfrm)
- dst = ((struct xfrm_dst *)dst)->route;
- if (!dst_hold_safe(dst))
- return -EHOSTUNREACH;
-
- if (sk && !net_eq(net, sock_net(sk)))
- sk = NULL;
-
- dst = xfrm_lookup(net, dst, &fl, sk, 0);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- skb_dst_drop(skb);
- skb_dst_set(skb, dst);
-
- /* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
- if (skb_headroom(skb) < hh_len &&
- pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
- return -ENOMEM;
- return 0;
-}
-EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
-hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net,
+ const struct nf_conntrack_zone *zone,
+ const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
+ struct {
+ struct nf_conntrack_man src;
+ u32 net_mix;
+ u32 protonum;
+ u32 zone;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+ memset(&combined, 0, sizeof(combined));
+
/* Original src, to ensure we map it consistently if poss. */
- hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
+ combined.src = tuple->src;
+ combined.net_mix = net_hash_mix(net);
+ combined.protonum = tuple->dst.protonum;
+
+ /* Zone ID can be used provided it is valid for both directions */
+ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
+ combined.zone = zone->id;
+
+ hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);
return reciprocal_scale(hash, nf_nat_htable_size);
}
@@ -299,7 +279,7 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range2 *range)
{
- unsigned int h = hash_by_src(net, tuple);
+ unsigned int h = hash_by_src(net, zone, tuple);
const struct nf_conn *ct;
hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
@@ -408,7 +388,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
static const unsigned int max_attempts = 128;
switch (tuple->dst.protonum) {
- case IPPROTO_ICMP: /* fallthrough */
+ case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
/* id is same for either direction... */
keyptr = &tuple->src.u.icmp.id;
@@ -442,11 +422,11 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
}
goto find_free_id;
#endif
- case IPPROTO_UDP: /* fallthrough */
- case IPPROTO_UDPLITE: /* fallthrough */
- case IPPROTO_TCP: /* fallthrough */
- case IPPROTO_SCTP: /* fallthrough */
- case IPPROTO_DCCP: /* fallthrough */
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
if (maniptype == NF_NAT_MANIP_SRC)
keyptr = &tuple->src.u.all;
else
@@ -488,7 +468,7 @@ find_free_id:
if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
else
- off = prandom_u32();
+ off = get_random_u16();
attempts = range_size;
if (attempts > max_attempts)
@@ -510,7 +490,7 @@ another_round:
if (attempts >= range_size || attempts < 16)
return;
attempts /= 2;
- off = prandom_u32();
+ off = get_random_u16();
goto another_round;
}
@@ -646,7 +626,7 @@ nf_nat_setup_info(struct nf_conn *ct,
unsigned int srchash;
spinlock_t *lock;
- srchash = hash_by_src(net,
+ srchash = hash_by_src(net, nf_ct_zone(ct),
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
@@ -719,6 +699,16 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
+/* Report whether @state describes the POST_ROUTING hook with an L3 master
+ * device as output device. Always false when CONFIG_NET_L3_MASTER_DEV is
+ * not enabled.
+ */
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+	return state->hook == NF_INET_POST_ROUTING &&
+	       netif_is_l3_master(state->out);
+#else
+	return false;
+#endif
+}
+
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -735,7 +725,7 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb,
* packet filter it out, or implement conntrack/NAT for that
* protocol. 8) --RR
*/
- if (!ct)
+ if (!ct || in_vrf_postrouting(state))
return NF_ACCEPT;
nat = nfct_nat(ct);
@@ -811,11 +801,11 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0;
}
-static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
unsigned int h;
- h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
hlist_del_rcu(&ct->nat_bysource);
spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
@@ -833,7 +823,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
- __nf_nat_cleanup_conntrack(ct);
+ nf_nat_cleanup_conntrack(ct);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -841,20 +831,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
return 0;
}
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
- if (ct->status & IPS_SRC_NAT_DONE)
- __nf_nat_cleanup_conntrack(ct);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
- .len = sizeof(struct nf_conn_nat),
- .align = __alignof__(struct nf_conn_nat),
- .destroy = nf_nat_cleanup_conntrack,
- .id = NF_CT_EXT_NAT,
-};
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -1140,12 +1116,13 @@ static struct pernet_operations nat_net_ops = {
.size = sizeof(struct nat_net),
};
-static struct nf_nat_hook nat_hook = {
+static const struct nf_nat_hook nat_hook = {
.parse_nat_setup = nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
.decode_session = __nf_nat_decode_session,
#endif
.manip_pkt = nf_nat_manip_pkt,
+ .remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
static int __init nf_nat_init(void)
@@ -1161,19 +1138,12 @@ static int __init nf_nat_init(void)
if (!nf_nat_bysource)
return -ENOMEM;
- ret = nf_ct_extend_register(&nat_extend);
- if (ret < 0) {
- kvfree(nf_nat_bysource);
- pr_err("Unable to register extension\n");
- return ret;
- }
-
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
- nf_ct_extend_unregister(&nat_extend);
+ kvfree(nf_nat_bysource);
return ret;
}
@@ -1182,7 +1152,16 @@ static int __init nf_nat_init(void)
WARN_ON(nf_nat_hook != NULL);
RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
- return 0;
+ ret = register_nf_nat_bpf();
+ if (ret < 0) {
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
+ kvfree(nf_nat_bysource);
+ }
+
+ return ret;
}
static void __exit nf_nat_cleanup(void)
@@ -1191,7 +1170,6 @@ static void __exit nf_nat_cleanup(void)
nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
- nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_hook, NULL);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index aace6768a64e..c92a436d9c48 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -86,22 +86,9 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
- nf_ct_helper_log(skb, ct, "all ports in use");
+ nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
}
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index a263505455fc..a95a25196943 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -198,3 +198,34 @@ void nf_nat_follow_master(struct nf_conn *ct,
nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
}
EXPORT_SYMBOL(nf_nat_follow_master);
+
+/* Find a destination port for @exp and register the expectation.
+ *
+ * The requested @port is tried first. While nf_ct_expect_related() keeps
+ * returning -EBUSY, a random port from [@port, USHRT_MAX - 1] is tried, for
+ * at most min(USHRT_MAX - @port, 128) retries. Any other error stops the
+ * search immediately.
+ *
+ * Returns the port (host byte order) the expectation was registered with,
+ * or 0 if none could be registered. exp->tuple.dst.u.tcp.port is left at the
+ * last port tried.
+ */
+u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port)
+{
+	static const unsigned int retry_cap = 128;
+	const u16 first = port;
+	const int span = USHRT_MAX - port;
+	int budget = span;
+
+	if (budget > retry_cap)
+		budget = retry_cap;
+
+	while (1) {
+		int err;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		err = nf_ct_expect_related(exp, 0);
+		if (!err)
+			return port;
+
+		/* Only "busy" is worth retrying with another port */
+		if (err != -EBUSY)
+			return 0;
+
+		if (--budget < 0)
+			return 0;
+
+		port = first + prandom_u32_max(span);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_nat_exp_find_port);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index c691ab8d234c..19c4fcc60c50 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -48,20 +48,8 @@ static unsigned int help(struct sk_buff *skb,
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp,
+ ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use");
return NF_DROP;
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 8e8a65d46345..1a506b0c6511 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -9,8 +9,20 @@
#include <net/netfilter/nf_nat_masquerade.h>
+/* One deferred conntrack cleanup request, allocated by nf_nat_masq_schedule()
+ * and freed by iterate_cleanup_work() once the walk has finished.
+ */
+struct masq_dev_work {
+ struct work_struct work; /* work item; handler is iterate_cleanup_work() */
+ struct net *net; /* netns to walk; reference held until the work ran */
+ netns_tracker ns_tracker; /* tracker paired with the @net reference */
+ union nf_inet_addr addr; /* address to match; all-zero if none was given */
+ int ifindex; /* interface index compared against nat->masq_index */
+ int (*iter)(struct nf_conn *i, void *data); /* callback passed to nf_ct_iterate_cleanup_net() */
+};
+
+#define MAX_MASQ_WORKER_COUNT 16
+
static DEFINE_MUTEX(masq_mutex);
static unsigned int masq_refcnt __read_mostly;
+static atomic_t masq_worker_count __read_mostly;
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
@@ -63,13 +75,75 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
-static int device_cmp(struct nf_conn *i, void *ifindex)
+/* Work handler: run the queued conntrack walk, then release everything the
+ * request pinned (netns reference, the request itself, the worker count and
+ * the module reference), in that order.
+ */
+static void iterate_cleanup_work(struct work_struct *work)
+{
+	struct masq_dev_work *w = container_of(work, struct masq_dev_work,
+					       work);
+	struct nf_ct_iter_data iter_data = {
+		.net	= w->net,
+		.data	= (void *)w,
+	};
+
+	nf_ct_iterate_cleanup_net(w->iter, &iter_data);
+
+	put_net_track(w->net, &w->ns_tracker);
+	kfree(w);
+	atomic_dec(&masq_worker_count);
+	module_put(THIS_MODULE);
+}
+
+/* Queue a background walk of the conntrack table of @net that removes the
+ * entries using the device/address being removed; @iter decides per entry.
+ *
+ * The request is silently dropped if too many work items are already
+ * queued, if the netns or module reference cannot be taken, or if memory
+ * allocation fails. In that case the conntrack entries will time out
+ * eventually.
+ *
+ * @addr may be NULL, in which case the stored address stays zeroed.
+ */
+static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
+				 int ifindex,
+				 int (*iter)(struct nf_conn *i, void *data),
+				 gfp_t gfp_flags)
+{
+	struct masq_dev_work *w;
+
+	if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
+		return;
+
+	net = maybe_get_net(net);
+	if (!net)
+		return;
+
+	if (!try_module_get(THIS_MODULE))
+		goto drop_net;
+
+	w = kzalloc(sizeof(*w), gfp_flags);
+	if (!w)
+		goto drop_module;
+
+	/* The count may overshoot MAX_MASQ_WORKER_COUNT; that is acceptable */
+	atomic_inc(&masq_worker_count);
+
+	INIT_WORK(&w->work, iterate_cleanup_work);
+	w->net = net;
+	netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
+	w->ifindex = ifindex;
+	w->iter = iter;
+	if (addr)
+		w->addr = *addr;
+
+	schedule_work(&w->work);
+	return;
+
+drop_module:
+	module_put(THIS_MODULE);
+drop_net:
+	put_net(net);
+}
+
+static int device_cmp(struct nf_conn *i, void *arg)
{
const struct nf_conn_nat *nat = nfct_nat(i);
+ const struct masq_dev_work *w = arg;
if (!nat)
return 0;
- return nat->masq_index == (int)(long)ifindex;
+ return nat->masq_index == w->ifindex;
}
static int masq_device_event(struct notifier_block *this,
@@ -85,8 +159,8 @@ static int masq_device_event(struct notifier_block *this,
* and forget them.
*/
- nf_ct_iterate_cleanup_net(net, device_cmp,
- (void *)(long)dev->ifindex, 0, 0);
+ nf_nat_masq_schedule(net, NULL, dev->ifindex,
+ device_cmp, GFP_KERNEL);
}
return NOTIFY_DONE;
@@ -94,35 +168,45 @@ static int masq_device_event(struct notifier_block *this,
static int inet_cmp(struct nf_conn *ct, void *ptr)
{
- struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
- struct net_device *dev = ifa->ifa_dev->dev;
struct nf_conntrack_tuple *tuple;
+ struct masq_dev_work *w = ptr;
- if (!device_cmp(ct, (void *)(long)dev->ifindex))
+ if (!device_cmp(ct, ptr))
return 0;
tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- return ifa->ifa_address == tuple->dst.u3.ip;
+ return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
}
static int masq_inet_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
- struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
- struct net *net = dev_net(idev->dev);
+ const struct in_ifaddr *ifa = ptr;
+ const struct in_device *idev;
+ const struct net_device *dev;
+ union nf_inet_addr addr;
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
/* The masq_dev_notifier will catch the case of the device going
* down. So if the inetdev is dead and being destroyed we have
* no work to do. Otherwise this is an individual address removal
* and we have to perform the flush.
*/
+ idev = ifa->ifa_dev;
if (idev->dead)
return NOTIFY_DONE;
- if (event == NETDEV_DOWN)
- nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
+ memset(&addr, 0, sizeof(addr));
+
+ addr.ip = ifa->ifa_address;
+
+ dev = idev->dev;
+ nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
+ inet_cmp, GFP_KERNEL);
return NOTIFY_DONE;
}
@@ -136,8 +220,6 @@ static struct notifier_block masq_inet_notifier = {
};
#if IS_ENABLED(CONFIG_IPV6)
-static atomic_t v6_worker_count __read_mostly;
-
static int
nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
const struct in6_addr *daddr, unsigned int srcprefs,
@@ -187,40 +269,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
-struct masq_dev_work {
- struct work_struct work;
- struct net *net;
- struct in6_addr addr;
- int ifindex;
-};
-
-static int inet6_cmp(struct nf_conn *ct, void *work)
-{
- struct masq_dev_work *w = (struct masq_dev_work *)work;
- struct nf_conntrack_tuple *tuple;
-
- if (!device_cmp(ct, (void *)(long)w->ifindex))
- return 0;
-
- tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-
- return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6);
-}
-
-static void iterate_cleanup_work(struct work_struct *work)
-{
- struct masq_dev_work *w;
-
- w = container_of(work, struct masq_dev_work, work);
-
- nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0);
-
- put_net(w->net);
- kfree(w);
- atomic_dec(&v6_worker_count);
- module_put(THIS_MODULE);
-}
-
/* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
*
* Defer it to the system workqueue.
@@ -233,36 +281,19 @@ static int masq_inet6_event(struct notifier_block *this,
{
struct inet6_ifaddr *ifa = ptr;
const struct net_device *dev;
- struct masq_dev_work *w;
- struct net *net;
+ union nf_inet_addr addr;
- if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16)
+ if (event != NETDEV_DOWN)
return NOTIFY_DONE;
dev = ifa->idev->dev;
- net = maybe_get_net(dev_net(dev));
- if (!net)
- return NOTIFY_DONE;
-
- if (!try_module_get(THIS_MODULE))
- goto err_module;
- w = kmalloc(sizeof(*w), GFP_ATOMIC);
- if (w) {
- atomic_inc(&v6_worker_count);
+ memset(&addr, 0, sizeof(addr));
- INIT_WORK(&w->work, iterate_cleanup_work);
- w->ifindex = dev->ifindex;
- w->net = net;
- w->addr = ifa->addr;
- schedule_work(&w->work);
+ addr.in6 = ifa->addr;
- return NOTIFY_DONE;
- }
-
- module_put(THIS_MODULE);
- err_module:
- put_net(net);
+ nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
+ GFP_ATOMIC);
return NOTIFY_DONE;
}
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 64eedc17037a..48cc60084d28 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -68,15 +68,13 @@ static bool udp_manip_pkt(struct sk_buff *skb,
enum nf_nat_manip_type maniptype)
{
struct udphdr *hdr;
- bool do_csum;
if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
- do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
+ __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check);
- __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, do_csum);
return true;
}
@@ -648,8 +646,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
}
static unsigned int
-nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
+nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
@@ -661,6 +659,61 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
return ret;
}
+#ifdef CONFIG_XFRM
+/* Redo the xfrm lookup for @skb and replace its dst with the result.
+ *
+ * Returns 0 on success, the negative error from xfrm_decode_session() or
+ * xfrm_lookup(), -EHOSTUNREACH if no reference on the current route could
+ * be taken, or -ENOMEM if the headroom could not be grown for the new
+ * output device.
+ */
+static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
+{
+ struct sock *sk = skb->sk;
+ struct dst_entry *dst;
+ unsigned int hh_len;
+ struct flowi fl;
+ int err;
+
+ err = xfrm_decode_session(skb, &fl, family);
+ if (err < 0)
+ return err;
+
+ /* Start from the underlying route if the current dst is an xfrm one */
+ dst = skb_dst(skb);
+ if (dst->xfrm)
+ dst = ((struct xfrm_dst *)dst)->route;
+ if (!dst_hold_safe(dst))
+ return -EHOSTUNREACH;
+
+ /* Ignore a socket that belongs to a different network namespace */
+ if (sk && !net_eq(net, sock_net(sk)))
+ sk = NULL;
+
+ dst = xfrm_lookup(net, dst, &fl, sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ /* Swap the skb's dst for the one returned by the lookup */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+ return -ENOMEM;
+ return 0;
+}
+#endif
+
+/* LOCAL_IN NAT hook for IPv4.
+ *
+ * Runs the generic IPv4 NAT hook and returns its verdict. If the packet was
+ * accepted, already had a socket attached before NAT ran, had its source
+ * address rewritten, and that socket is not transparent, the skb is orphaned
+ * because the socket was picked for the pre-NAT address.
+ */
+static unsigned int
+nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
+		     const struct nf_hook_state *state)
+{
+	const __be32 saddr_before = ip_hdr(skb)->saddr;
+	struct sock *sk = skb->sk;
+	unsigned int verdict = nf_nat_ipv4_fn(priv, skb, state);
+
+	if (verdict != NF_ACCEPT || !sk)
+		return verdict;
+
+	if (saddr_before == ip_hdr(skb)->saddr)
+		return verdict;
+
+	if (inet_sk_transparent(sk))
+		return verdict;
+
+	skb_orphan(skb); /* TCP edemux obtained wrong socket */
+	return verdict;
+}
+
static unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -717,7 +770,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -738,7 +791,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
- .hook = nf_nat_ipv4_in,
+ .hook = nf_nat_ipv4_pre_routing,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
@@ -759,7 +812,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv4_fn,
+ .hook = nf_nat_ipv4_local_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
@@ -955,7 +1008,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
&ct->tuplehash[!dir].tuple.src.u3)) {
- err = nf_ip6_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -1035,8 +1088,8 @@ int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops)
ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops,
ARRAY_SIZE(nf_nat_ipv4_ops));
if (ret)
- nf_nat_ipv6_unregister_fn(net, ops);
-
+ nf_nat_unregister_fn(net, NFPROTO_IPV6, ops,
+ ARRAY_SIZE(nf_nat_ipv6_ops));
return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn);
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index f0a735e86851..cf4aeb299bde 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -410,19 +410,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
exp->dir = !dir;
exp->expectfn = nf_nat_sip_expected;
- for (; port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(exp, NF_CT_EXP_F_SKIP_MASTER);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, port);
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use for SIP");
return NF_DROP;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index f8f52ff99cfb..63d1516816b1 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -21,6 +21,8 @@
#include "nf_internals.h"
+static const struct nf_queue_handler __rcu *nf_queue_handler;
+
/*
* Hook for nfnetlink_queue to register its queue handler.
* We do this so that most of the NFQUEUE code can be modular.
@@ -29,88 +31,86 @@
* receives, no matter what.
*/
-/* return EBUSY when somebody else is registered, return EEXIST if the
- * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
{
/* should never happen, we only have one queueing backend in kernel */
- WARN_ON(rcu_access_pointer(net->nf.queue_handler));
- rcu_assign_pointer(net->nf.queue_handler, qh);
+ WARN_ON(rcu_access_pointer(nf_queue_handler));
+ rcu_assign_pointer(nf_queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-void nf_unregister_queue_handler(struct net *net)
+void nf_unregister_queue_handler(void)
{
- RCU_INIT_POINTER(net->nf.queue_handler, NULL);
+ RCU_INIT_POINTER(nf_queue_handler, NULL);
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb)
+static void nf_queue_sock_put(struct sock *sk)
{
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
-
- if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_put(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_put(physdev);
- }
+#ifdef CONFIG_INET
+ sock_gen_put(sk);
+#else
+ sock_put(sk);
#endif
}
-void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
/* Release those devices we held, or Alexey will kill me. */
- if (state->in)
- dev_put(state->in);
- if (state->out)
- dev_put(state->out);
+ dev_put(state->in);
+ dev_put(state->out);
if (state->sk)
- sock_put(state->sk);
+ nf_queue_sock_put(state->sk);
- nf_queue_entry_release_br_nf_refs(entry->skb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ dev_put(entry->physin);
+ dev_put(entry->physout);
+#endif
}
-EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
-static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb)
+/*
+ * Release the references recorded in @entry via
+ * nf_queue_entry_release_refs(), then free @entry itself with kfree().
+ */
+void nf_queue_entry_free(struct nf_queue_entry *entry)
+{
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_free);
+
+static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ const struct sk_buff *skb = entry->skb;
+ struct nf_bridge_info *nf_bridge;
+ nf_bridge = nf_bridge_info_get(skb);
if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_hold(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_hold(physdev);
+ entry->physin = nf_bridge_get_physindev(skb);
+ entry->physout = nf_bridge_get_physoutdev(skb);
+ } else {
+ entry->physin = NULL;
+ entry->physout = NULL;
}
#endif
}
/* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
- if (state->in)
- dev_hold(state->in);
- if (state->out)
- dev_hold(state->out);
- if (state->sk)
- sock_hold(state->sk);
+ if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+ return false;
- nf_queue_entry_get_br_nf_refs(entry->skb);
+ dev_hold(state->in);
+ dev_hold(state->out);
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ dev_hold(entry->physin);
+ dev_hold(entry->physout);
+#endif
+ return true;
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
@@ -119,7 +119,7 @@ void nf_queue_nf_hook_drop(struct net *net)
const struct nf_queue_handler *qh;
rcu_read_lock();
- qh = rcu_dereference(net->nf.queue_handler);
+ qh = rcu_dereference(nf_queue_handler);
if (qh)
qh->nf_hook_drop(net);
rcu_read_unlock();
@@ -158,18 +158,15 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
unsigned int index, unsigned int queuenum)
{
- int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
const struct nf_queue_handler *qh;
- struct net *net = state->net;
unsigned int route_key_size;
+ int status;
/* QUEUE == DROP if no one is waiting, to be safe. */
- qh = rcu_dereference(net->nf.queue_handler);
- if (!qh) {
- status = -ESRCH;
- goto err;
- }
+ qh = rcu_dereference(nf_queue_handler);
+ if (!qh)
+ return -ESRCH;
switch (state->pf) {
case AF_INET:
@@ -183,15 +180,25 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
break;
}
- entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
- if (!entry) {
- status = -ENOMEM;
- goto err;
+ if (skb_sk_is_prefetched(skb)) {
+ struct sock *sk = skb->sk;
+
+ if (!sk_is_refcounted(sk)) {
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ return -ENOTCONN;
+
+ /* drop refcount on skb_orphan */
+ skb->destructor = sock_edemux;
+ }
}
+ entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+
if (skb_dst(skb) && !skb_dst_force(skb)) {
- status = -ENETDOWN;
- goto err;
+ kfree(entry);
+ return -ENETDOWN;
}
*entry = (struct nf_queue_entry) {
@@ -201,7 +208,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
.size = sizeof(*entry) + route_key_size,
};
- nf_queue_entry_get_refs(entry);
+ __nf_queue_entry_init_physdevs(entry);
+
+ if (!nf_queue_entry_get_refs(entry)) {
+ kfree(entry);
+ return -ENOTCONN;
+ }
switch (entry->state.pf) {
case AF_INET:
@@ -213,17 +225,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
}
status = qh->outfn(entry, queuenum);
-
if (status < 0) {
- nf_queue_entry_release_refs(entry);
- goto err;
+ nf_queue_entry_free(entry);
+ return status;
}
return 0;
-
-err:
- kfree(entry);
- return status;
}
/* Packets leaving via this function must come back through nf_reinject(). */
@@ -304,12 +311,10 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
hooks = nf_hook_entries_head(net, pf, entry->state.hook);
- nf_queue_entry_release_refs(entry);
-
i = entry->hook_index;
if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
kfree_skb(skb);
- kfree(entry);
+ nf_queue_entry_free(entry);
return;
}
@@ -348,6 +353,6 @@ next_hook:
kfree_skb(skb);
}
- kfree(entry);
+ nf_queue_entry_free(entry);
}
EXPORT_SYMBOL(nf_reinject);
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 46cb3786e0ec..34afcd03b6f6 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -89,78 +89,32 @@ out:
return ops;
}
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
- char __user *opt, int *len, int get)
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt,
+ unsigned int len)
{
struct nf_sockopt_ops *ops;
int ret;
- ops = nf_sockopt_find(sk, pf, val, get);
+ ops = nf_sockopt_find(sk, pf, val, 0);
if (IS_ERR(ops))
return PTR_ERR(ops);
-
- if (get)
- ret = ops->get(sk, val, opt, len);
- else
- ret = ops->set(sk, val, opt, *len);
-
+ ret = ops->set(sk, val, opt, len);
module_put(ops->owner);
return ret;
}
-
-int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
- unsigned int len)
-{
- return nf_sockopt(sk, pf, val, opt, &len, 0);
-}
EXPORT_SYMBOL(nf_setsockopt);
int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
int *len)
{
- return nf_sockopt(sk, pf, val, opt, len, 1);
-}
-EXPORT_SYMBOL(nf_getsockopt);
-
-#ifdef CONFIG_COMPAT
-static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
- char __user *opt, int *len, int get)
-{
struct nf_sockopt_ops *ops;
int ret;
- ops = nf_sockopt_find(sk, pf, val, get);
+ ops = nf_sockopt_find(sk, pf, val, 1);
if (IS_ERR(ops))
return PTR_ERR(ops);
-
- if (get) {
- if (ops->compat_get)
- ret = ops->compat_get(sk, val, opt, len);
- else
- ret = ops->get(sk, val, opt, len);
- } else {
- if (ops->compat_set)
- ret = ops->compat_set(sk, val, opt, *len);
- else
- ret = ops->set(sk, val, opt, *len);
- }
-
+ ret = ops->get(sk, val, opt, len);
module_put(ops->owner);
return ret;
}
-
-int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
- int val, char __user *opt, unsigned int len)
-{
- return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
-}
-EXPORT_SYMBOL(compat_nf_setsockopt);
-
-int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
- int val, char __user *opt, int *len)
-{
- return compat_nf_sockopt(sk, pf, val, opt, len, 1);
-}
-EXPORT_SYMBOL(compat_nf_getsockopt);
-#endif
+EXPORT_SYMBOL(nf_getsockopt);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index b9cbe1e2453e..16915f8eef2b 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -31,6 +31,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
int length = (th->doff * 4) - sizeof(*th);
u8 buf[40], *ptr;
+ if (unlikely(length < 0))
+ return false;
+
ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
if (ptr == NULL)
return false;
@@ -47,6 +50,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
length--;
continue;
default:
+ if (length < 2)
+ return true;
opsize = *ptr++;
if (opsize < 2)
return true;
@@ -231,12 +236,6 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
return 1;
}
-static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
- .len = sizeof(struct nf_conn_synproxy),
- .align = __alignof__(struct nf_conn_synproxy),
- .id = NF_CT_EXT_SYNPROXY,
-};
-
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -344,7 +343,6 @@ static int __net_init synproxy_net_init(struct net *net)
goto err2;
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
@@ -383,28 +381,12 @@ static struct pernet_operations synproxy_net_ops = {
static int __init synproxy_core_init(void)
{
- int err;
-
- err = nf_ct_extend_register(&nf_ct_synproxy_extend);
- if (err < 0)
- goto err1;
-
- err = register_pernet_subsys(&synproxy_net_ops);
- if (err < 0)
- goto err2;
-
- return 0;
-
-err2:
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
-err1:
- return err;
+ return register_pernet_subsys(&synproxy_net_ops);
}
static void __exit synproxy_core_exit(void)
{
unregister_pernet_subsys(&synproxy_net_ops);
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
}
module_init(synproxy_core_init);
@@ -423,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
iph->tos = 0;
iph->id = 0;
iph->frag_off = htons(IP_DF);
- iph->ttl = net->ipv4.sysctl_ip_default_ttl;
+ iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
iph->protocol = IPPROTO_TCP;
iph->check = 0;
iph->saddr = saddr;
@@ -446,7 +428,7 @@ synproxy_send_tcp(struct net *net,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -704,8 +686,7 @@ ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
nf_ct_seqadj_init(ct, ctinfo, 0);
synproxy->tsoff = 0;
this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
+ fallthrough;
case TCP_CONNTRACK_SYN_SENT:
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
@@ -850,7 +831,7 @@ synproxy_send_tcp_ipv6(struct net *net,
fl6.fl6_sport = nth->source;
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb,
- flowi6_to_flowi(&fl6));
+ flowi6_to_flowi_common(&fl6));
err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (err) {
goto free_nskb;
@@ -1128,8 +1109,7 @@ ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
nf_ct_seqadj_init(ct, ctinfo, 0);
synproxy->tsoff = 0;
this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
+ fallthrough;
case TCP_CONNTRACK_SYN_SENT:
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
@@ -1237,3 +1217,4 @@ EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("nftables SYNPROXY expression support");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 38c680f28f15..e7152d599d73 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -12,6 +12,7 @@
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/rhashtable.h>
+#include <linux/audit.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
@@ -24,12 +25,13 @@
#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
+unsigned int nf_tables_net_id __read_mostly;
+
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
-static u64 table_handle;
enum {
NFT_VALIDATE_SKIP = 0,
@@ -65,9 +67,46 @@ static const struct rhashtable_params nft_objname_ht_params = {
.automatic_shrinking = true,
};
+/*
+ * List node pairing a table with an entry count and an operation code.
+ * NOTE(review): presumably @op holds an AUDIT_NFT_OP_* value and @entries
+ * counts the objects affected in @table; the users of this struct are not
+ * visible in this hunk, so confirm against them.
+ */
+struct nft_audit_data {
+ struct nft_table *table;
+ int entries;
+ int op;
+ struct list_head list;
+};
+
+/*
+ * Maps each NFT_MSG_* message type (the array index) to an AUDIT_NFT_OP_*
+ * code. The GET* and TRACE message types map to AUDIT_NFT_OP_INVALID, with
+ * the exception of NFT_MSG_GETOBJ_RESET, which maps to
+ * AUDIT_NFT_OP_OBJ_RESET.
+ */
+static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
+ [NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER,
+ [NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER,
+ [NFT_MSG_NEWCHAIN] = AUDIT_NFT_OP_CHAIN_REGISTER,
+ [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER,
+ [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER,
+ [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER,
+ [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER,
+ [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER,
+ [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER,
+ [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER,
+ [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER,
+ [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER,
+ [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER,
+ [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET,
+ [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER,
+ [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+};
+
static void nft_validate_state_update(struct net *net, u8 new_validate_state)
{
- switch (net->nft.validate_state) {
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ switch (nft_net->validate_state) {
case NFT_VALIDATE_SKIP:
WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
break;
@@ -78,7 +117,7 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
return;
}
- net->nft.validate_state = new_validate_state;
+ nft_net->validate_state = new_validate_state;
}
static void nf_tables_trans_destroy_work(struct work_struct *w);
static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
@@ -113,6 +152,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
if (trans == NULL)
return NULL;
+ INIT_LIST_HEAD(&trans->list);
trans->msg_type = msg_type;
trans->ctx = *ctx;
@@ -133,13 +173,15 @@ static void nft_trans_destroy(struct nft_trans *trans)
static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
{
+ struct nftables_pernet *nft_net;
struct net *net = ctx->net;
struct nft_trans *trans;
if (!nft_set_is_anonymous(set))
return;
- list_for_each_entry_reverse(trans, &net->nft.commit_list, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
switch (trans->msg_type) {
case NFT_MSG_NEWSET:
if (nft_trans_set(trans) == set)
@@ -180,30 +222,18 @@ err_register:
}
static void nft_netdev_unregister_hooks(struct net *net,
- struct list_head *hook_list)
+ struct list_head *hook_list,
+ bool release_netdev)
{
- struct nft_hook *hook;
+ struct nft_hook *hook, *next;
- list_for_each_entry(hook, hook_list, list)
+ list_for_each_entry_safe(hook, next, hook_list, list) {
nf_unregister_net_hook(net, &hook->ops);
-}
-
-static int nft_register_basechain_hooks(struct net *net, int family,
- struct nft_base_chain *basechain)
-{
- if (family == NFPROTO_NETDEV)
- return nft_netdev_register_hooks(net, &basechain->hook_list);
-
- return nf_register_net_hook(net, &basechain->ops);
-}
-
-static void nft_unregister_basechain_hooks(struct net *net, int family,
- struct nft_base_chain *basechain)
-{
- if (family == NFPROTO_NETDEV)
- nft_netdev_unregister_hooks(net, &basechain->hook_list);
- else
- nf_unregister_net_hook(net, &basechain->ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+ }
}
static int nf_tables_register_hook(struct net *net,
@@ -223,12 +253,16 @@ static int nf_tables_register_hook(struct net *net,
if (basechain->type->ops_register)
return basechain->type->ops_register(net, ops);
- return nft_register_basechain_hooks(net, table->family, basechain);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ return nft_netdev_register_hooks(net, &basechain->hook_list);
+
+ return nf_register_net_hook(net, &basechain->ops);
}
-static void nf_tables_unregister_hook(struct net *net,
- const struct nft_table *table,
- struct nft_chain *chain)
+static void __nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain,
+ bool release_netdev)
{
struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
@@ -242,7 +276,25 @@ static void nf_tables_unregister_hook(struct net *net,
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- nft_unregister_basechain_hooks(net, table->family, basechain);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ nft_netdev_unregister_hooks(net, &basechain->hook_list,
+ release_netdev);
+ else
+ nf_unregister_net_hook(net, &basechain->ops);
+}
+
+/*
+ * Unregister the hooks of @chain through __nf_tables_unregister_hook()
+ * with release_netdev set to false.
+ */
+static void nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain)
+{
+ return __nf_tables_unregister_hook(net, table, chain, false);
+}
+
+/*
+ * Append @trans to the tail of the commit_list held in the nftables_pernet
+ * state that nft_pernet() returns for @net.
+ */
+static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ list_add_tail(&trans->list, &nft_net->commit_list);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -256,7 +308,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
if (msg_type == NFT_MSG_NEWTABLE)
nft_activate_next(ctx->net, ctx->table);
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -280,10 +332,16 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
if (trans == NULL)
return ERR_PTR(-ENOMEM);
- if (msg_type == NFT_MSG_NEWCHAIN)
+ if (msg_type == NFT_MSG_NEWCHAIN) {
nft_activate_next(ctx->net, ctx->chain);
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ if (ctx->nla[NFTA_CHAIN_ID]) {
+ nft_trans_chain_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
+ }
+ }
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return trans;
}
@@ -307,7 +365,7 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->activate)
expr->ops->activate(ctx, expr);
@@ -322,7 +380,7 @@ static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->deactivate)
expr->ops->deactivate(ctx, expr, phase);
@@ -356,7 +414,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
}
nft_trans_rule(trans) = rule;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return trans;
}
@@ -422,7 +480,7 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
nft_activate_next(ctx->net, set);
}
nft_trans_set(trans) = set;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -454,7 +512,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
nft_activate_next(ctx->net, obj);
nft_trans_obj(trans) = obj;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -486,8 +544,9 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
if (msg_type == NFT_MSG_NEWFLOWTABLE)
nft_activate_next(ctx->net, flowtable);
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
nft_trans_flowtable(trans) = flowtable;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -507,24 +566,84 @@ static int nft_delflowtable(struct nft_ctx *ctx,
return err;
}
+/*
+ * Cancel tracking for the registers that precede @dreg within the
+ * multi-register selection @dreg belongs to. num_reg, as stored by
+ * nft_reg_track_update(), is a register's offset from the first register
+ * of its selection, so the loop walks back from @dreg - num_reg up to
+ * @dreg - 1. @dreg itself is left untouched.
+ */
+static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
+{
+ int i;
+
+ for (i = track->regs[dreg].num_reg; i > 0; i--)
+ __nft_reg_track_cancel(track, dreg - i);
+}
+
+/*
+ * Record @expr as the selector tracked for register @dreg, clear any
+ * tracked bitwise expression and store @num_reg for that register.
+ */
+static void __nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr,
+ u8 dreg, u8 num_reg)
+{
+ track->regs[dreg].selector = expr;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = num_reg;
+}
+
+/*
+ * Track @expr as the selector of the registers written by a store of @len
+ * bytes starting at @dreg.
+ *
+ * Registers preceding @dreg that belong to the selection currently tracked
+ * at @dreg are cancelled first. Then each of the
+ * DIV_ROUND_UP(@len, NFT_REG32_SIZE) registers from @dreg onwards is
+ * updated, with num_reg set to its offset from @dreg.
+ */
+void nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_update(track, expr, dreg, i);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_update);
+
+/*
+ * Cancel tracking for the DIV_ROUND_UP(@len, NFT_REG32_SIZE) registers
+ * starting at @dreg, after first cancelling the registers that precede
+ * @dreg within the selection currently tracked there.
+ */
+void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_cancel(track, dreg);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_cancel);
+
+/*
+ * Reset the tracking state of the single register @dreg: no selector, no
+ * bitwise expression, num_reg of zero.
+ */
+void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg)
+{
+ track->regs[dreg].selector = NULL;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = 0;
+}
+EXPORT_SYMBOL_GPL(__nft_reg_track_cancel);
+
/*
* Tables
*/
static struct nft_table *nft_table_lookup(const struct net *net,
const struct nlattr *nla,
- u8 family, u8 genmask)
+ u8 family, u8 genmask, u32 nlpid)
{
+ struct nftables_pernet *nft_net;
struct nft_table *table;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry_rcu(table, &nft_net->tables, list,
+ lockdep_is_held(&nft_net->commit_mutex)) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
- nft_active_genmask(table, genmask))
+ nft_active_genmask(table, genmask)) {
+ if (nft_table_has_owner(table) &&
+ nlpid && table->nlpid != nlpid)
+ return ERR_PTR(-EPERM);
+
return table;
+ }
}
return ERR_PTR(-ENOENT);
@@ -532,14 +651,21 @@ static struct nft_table *nft_table_lookup(const struct net *net,
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
const struct nlattr *nla,
- u8 genmask)
+ u8 genmask, u32 nlpid)
{
+ struct nftables_pernet *nft_net;
struct nft_table *table;
- list_for_each_entry(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry(table, &nft_net->tables, list) {
if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
- nft_active_genmask(table, genmask))
+ nft_active_genmask(table, genmask)) {
+ if (nft_table_has_owner(table) &&
+ nlpid && table->nlpid != nlpid)
+ return ERR_PTR(-EPERM);
+
return table;
+ }
}
return ERR_PTR(-ENOENT);
@@ -585,9 +711,11 @@ struct nft_module_request {
};
#ifdef CONFIG_MODULES
-static int nft_request_module(struct net *net, const char *fmt, ...)
+__printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
+ ...)
{
char module_name[MODULE_NAME_LEN];
+ struct nftables_pernet *nft_net;
struct nft_module_request *req;
va_list args;
int ret;
@@ -598,7 +726,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...)
if (ret >= MODULE_NAME_LEN)
return 0;
- list_for_each_entry(req, &net->nft.module_list, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry(req, &nft_net->module_list, list) {
if (!strcmp(req->module, module_name)) {
if (req->done)
return 0;
@@ -613,17 +742,19 @@ static int nft_request_module(struct net *net, const char *fmt, ...)
return -ENOMEM;
req->done = false;
- strlcpy(req->module, module_name, MODULE_NAME_LEN);
- list_add_tail(&req->list, &net->nft.module_list);
+ strscpy(req->module, module_name, MODULE_NAME_LEN);
+ list_add_tail(&req->list, &nft_net->module_list);
return -EAGAIN;
}
+EXPORT_SYMBOL_GPL(nft_request_module);
#endif
static void lockdep_nfnl_nft_mutex_not_held(void)
{
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ if (debug_locks)
+ WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
#endif
}
@@ -649,11 +780,20 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
return ERR_PTR(-ENOENT);
}
+/*
+ * Return the low 16 bits of the base_seq stored in @net's nftables_pernet
+ * state, converted to network byte order with htons().
+ */
+static __be16 nft_base_seq(const struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ return htons(nft_net->base_seq & 0xffff);
+}
+
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
[NFTA_TABLE_NAME] = { .type = NLA_STRING,
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
[NFTA_TABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN }
};
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -661,24 +801,28 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
int family, const struct nft_table *table)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
- nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
+ nla_put_be32(skb, NFTA_TABLE_FLAGS,
+ htonl(table->flags & NFT_TABLE_F_MASK)) ||
nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
NFTA_TABLE_PAD))
goto nla_put_failure;
+ if (nft_table_has_owner(table) &&
+ nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid)))
+ goto nla_put_failure;
+
+ if (table->udata) {
+ if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
+ goto nla_put_failure;
+ }
nlmsg_end(skb, nlh);
return 0;
@@ -688,9 +832,23 @@ nla_put_failure:
return -1;
}
+/*
+ * Per-skb data kept in the skb control block. NFT_CB() reinterprets the
+ * start of skb->cb as a struct nftnl_skb_parms and yields it as an lvalue.
+ */
+struct nftnl_skb_parms {
+ bool report;
+};
+#define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb))
+
+/*
+ * Store @report in the control block of @skb and append @skb to the tail
+ * of @notify_list.
+ */
+static void nft_notify_enqueue(struct sk_buff *skb, bool report,
+ struct list_head *notify_list)
+{
+ NFT_CB(skb).report = report;
+ list_add_tail(&skb->list, notify_list);
+}
+
static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
+ struct nftables_pernet *nft_net;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -701,15 +859,18 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table);
+ event, flags, ctx->family, ctx->table);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_net = nft_pernet(ctx->net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -719,15 +880,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nftables_pernet *nft_net;
const struct nft_table *table;
unsigned int idx = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -773,28 +936,27 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
}
/* called with rcu_read_lock held */
-static int nf_tables_gettable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_table *table;
+ struct net *net = info->net;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_tables,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
return PTR_ERR(table);
@@ -805,14 +967,14 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
- family, table);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE,
+ 0, family, table);
if (err < 0)
- goto err;
+ goto err_fill_table_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_table_info:
kfree_skb(skb2);
return err;
}
@@ -831,8 +993,7 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
if (cnt && i++ == cnt)
break;
- nft_unregister_basechain_hooks(net, table->family,
- nft_base_chain(chain));
+ nf_tables_unregister_hook(net, table, chain);
}
}
@@ -847,8 +1008,7 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table)
if (!nft_is_base_chain(chain))
continue;
- err = nft_register_basechain_hooks(net, table->family,
- nft_base_chain(chain));
+ err = nf_tables_register_hook(net, table, chain);
if (err < 0)
goto err_register_hooks;
@@ -864,25 +1024,39 @@ err_register_hooks:
static void nf_tables_table_disable(struct net *net, struct nft_table *table)
{
+ table->flags &= ~NFT_TABLE_F_DORMANT;
nft_table_disable(net, table, 0);
+ table->flags |= NFT_TABLE_F_DORMANT;
}
+#define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1)
+#define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0)
+#define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1)
+#define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \
+ __NFT_TABLE_F_WAS_AWAKEN)
+
static int nf_tables_updtable(struct nft_ctx *ctx)
{
struct nft_trans *trans;
u32 flags;
- int ret = 0;
+ int ret;
if (!ctx->nla[NFTA_TABLE_FLAGS])
return 0;
flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
- if (flags & ~NFT_TABLE_F_DORMANT)
- return -EINVAL;
+ if (flags & ~NFT_TABLE_F_MASK)
+ return -EOPNOTSUPP;
if (flags == ctx->table->flags)
return 0;
+ if ((nft_table_has_owner(ctx->table) &&
+ !(flags & NFT_TABLE_F_OWNER)) ||
+ (!nft_table_has_owner(ctx->table) &&
+ flags & NFT_TABLE_F_OWNER))
+ return -EOPNOTSUPP;
+
trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
sizeof(struct nft_trans_table));
if (trans == NULL)
@@ -890,22 +1064,27 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
- nft_trans_table_enable(trans) = false;
+ ctx->table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
+ ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ret = nf_tables_table_enable(ctx->net, ctx->table);
- if (ret >= 0) {
- ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
- nft_trans_table_enable(trans) = true;
+ ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+ if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
+ ret = nf_tables_table_enable(ctx->net, ctx->table);
+ if (ret < 0)
+ goto err_register_hooks;
+
+ ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;
}
}
- if (ret < 0)
- goto err;
nft_trans_table_update(trans) = true;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
return 0;
-err:
+
+err_register_hooks:
nft_trans_destroy(trans);
return ret;
}
@@ -961,53 +1140,90 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
return strcmp(obj->key.name, k->name);
}
-static int nf_tables_newtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static bool nft_supported_family(u8 family)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ return false
+#ifdef CONFIG_NF_TABLES_INET
+ || family == NFPROTO_INET
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ || family == NFPROTO_IPV4
+#endif
+#ifdef CONFIG_NF_TABLES_ARP
+ || family == NFPROTO_ARP
+#endif
+#ifdef CONFIG_NF_TABLES_NETDEV
+ || family == NFPROTO_NETDEV
+#endif
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
+ || family == NFPROTO_BRIDGE
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+ || family == NFPROTO_IPV6
+#endif
+ ;
+}
+
+static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
- u32 flags = 0;
struct nft_ctx ctx;
+ u32 flags = 0;
int err;
- lockdep_assert_held(&net->nft.commit_mutex);
+ if (!nft_supported_family(family))
+ return -EOPNOTSUPP;
+
+ lockdep_assert_held(&nft_net->commit_mutex);
attr = nla[NFTA_TABLE_NAME];
- table = nft_table_lookup(net, attr, family, genmask);
+ table = nft_table_lookup(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
return nf_tables_updtable(&ctx);
}
if (nla[NFTA_TABLE_FLAGS]) {
flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
- if (flags & ~NFT_TABLE_F_DORMANT)
- return -EINVAL;
+ if (flags & ~NFT_TABLE_F_MASK)
+ return -EOPNOTSUPP;
}
err = -ENOMEM;
- table = kzalloc(sizeof(*table), GFP_KERNEL);
+ table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(attr, GFP_KERNEL);
+ table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
+ if (nla[NFTA_TABLE_USERDATA]) {
+ table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (table->udata == NULL)
+ goto err_table_udata;
+
+ table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
+ }
+
err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;
@@ -1018,18 +1234,22 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
- table->handle = ++table_handle;
+ table->handle = ++nft_net->table_handle;
+ if (table->flags & NFT_TABLE_F_OWNER)
+ table->nlpid = NETLINK_CB(skb).portid;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err_trans;
- list_add_tail_rcu(&table->list, &net->nft.tables);
+ list_add_tail_rcu(&table->list, &nft_net->tables);
return 0;
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
+ kfree(table->udata);
+err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
@@ -1049,6 +1269,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
+ if (nft_chain_is_bound(chain))
+ continue;
+
ctx->chain = chain;
err = nft_delrule_by_chain(ctx);
@@ -1091,6 +1314,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
+ if (nft_chain_is_bound(chain))
+ continue;
+
ctx->chain = chain;
err = nft_delchain(ctx);
@@ -1105,11 +1331,12 @@ out:
static int nft_flush(struct nft_ctx *ctx, int family)
{
- struct nft_table *table, *nt;
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nlattr * const *nla = ctx->nla;
+ struct nft_table *table, *nt;
int err = 0;
- list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) {
+ list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
if (family != AF_UNSPEC && table->family != family)
continue;
@@ -1118,6 +1345,9 @@ static int nft_flush(struct nft_ctx *ctx, int family)
if (!nft_is_active_next(ctx->net, table))
continue;
+ if (nft_table_has_owner(table) && table->nlpid != ctx->portid)
+ continue;
+
if (nla[NFTA_TABLE_NAME] &&
nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
continue;
@@ -1132,29 +1362,30 @@ out:
return err;
}
-static int nf_tables_deltable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla);
if (family == AF_UNSPEC ||
(!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
return nft_flush(&ctx, family);
if (nla[NFTA_TABLE_HANDLE]) {
attr = nla[NFTA_TABLE_HANDLE];
- table = nft_table_lookup_byhandle(net, attr, genmask);
+ table = nft_table_lookup_byhandle(net, attr, genmask,
+ NETLINK_CB(skb).portid);
} else {
attr = nla[NFTA_TABLE_NAME];
- table = nft_table_lookup(net, attr, family, genmask);
+ table = nft_table_lookup(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
}
if (IS_ERR(table)) {
@@ -1162,7 +1393,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- if (nlh->nlmsg_flags & NLM_F_NONREC &&
+ if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
table->use > 0)
return -EBUSY;
@@ -1179,6 +1410,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
rhltable_destroy(&ctx->table->chains_ht);
kfree(ctx->table->name);
+ kfree(ctx->table->udata);
kfree(ctx->table);
}
@@ -1223,7 +1455,9 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
static bool lockdep_commit_lock_is_held(const struct net *net)
{
#ifdef CONFIG_PROVE_LOCKING
- return lockdep_is_held(&net->nft.commit_mutex);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ return lockdep_is_held(&nft_net->commit_mutex);
#else
return true;
#endif
@@ -1240,7 +1474,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
WARN_ON(!rcu_read_lock_held() &&
!lockdep_commit_lock_is_held(net));
@@ -1273,6 +1507,9 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
.len = NFT_MODULE_AUTOLOAD_LIMIT },
[NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
[NFTA_CHAIN_FLAGS] = { .type = NLA_U32 },
+ [NFTA_CHAIN_ID] = { .type = NLA_U32 },
+ [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -1337,7 +1574,7 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
goto nla_put_failure;
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, ops->hooknum)) {
nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
list_for_each_entry(hook, &basechain->hook_list, list) {
if (!first)
@@ -1367,18 +1604,13 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
const struct nft_chain *chain)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
goto nla_put_failure;
if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
@@ -1405,16 +1637,19 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
lockdep_commit_lock_is_held(net));
if (nft_dump_stats(skb, stats))
goto nla_put_failure;
-
- if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) &&
- nla_put_be32(skb, NFTA_CHAIN_FLAGS,
- htonl(NFT_CHAIN_HW_OFFLOAD)))
- goto nla_put_failure;
}
+ if (chain->flags &&
+ nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags)))
+ goto nla_put_failure;
+
if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
goto nla_put_failure;
+ if (chain->udata &&
+ nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1425,7 +1660,9 @@ nla_put_failure:
static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
{
+ struct nftables_pernet *nft_net;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -1436,16 +1673,19 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table,
+ event, flags, ctx->family, ctx->table,
ctx->chain);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_net = nft_pernet(ctx->net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -1455,16 +1695,18 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_table *table;
- const struct nft_chain *chain;
unsigned int idx = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -1497,29 +1739,28 @@ done:
}
/* called with rcu_read_lock held */
-static int nf_tables_getchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_chain *chain;
+ struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_chains,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -1536,14 +1777,14 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
- family, table, chain);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
+ 0, family, table, chain);
if (err < 0)
- goto err;
+ goto err_fill_chain_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_chain_info:
kfree_skb(skb2);
return err;
}
@@ -1601,19 +1842,19 @@ static void nft_chain_stats_replace(struct nft_trans *trans)
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
{
- struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
- struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+ struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0);
+ struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1);
if (g0 != g1)
kvfree(g1);
kvfree(g0);
/* should be NULL either via abort or via successful commit */
- WARN_ON_ONCE(chain->rules_next);
- kvfree(chain->rules_next);
+ WARN_ON_ONCE(chain->blob_next);
+ kvfree(chain->blob_next);
}
-static void nf_tables_chain_destroy(struct nft_ctx *ctx)
+void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
struct nft_chain *chain = ctx->chain;
struct nft_hook *hook, *next;
@@ -1627,7 +1868,7 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
- if (ctx->family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
@@ -1640,9 +1881,11 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
free_percpu(rcu_dereference_raw(basechain->stats));
}
kfree(chain->name);
+ kfree(chain->udata);
kfree(basechain);
} else {
kfree(chain->name);
+ kfree(chain->udata);
kfree(chain);
}
}
@@ -1655,13 +1898,17 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
struct nft_hook *hook;
int err;
- hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
+ hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
if (!hook) {
err = -ENOMEM;
goto err_hook_alloc;
}
- nla_strlcpy(ifname, attr, IFNAMSIZ);
+ nla_strscpy(ifname, attr, IFNAMSIZ);
+ /* nf_tables_netdev_event() is called under rtnl_mutex, this is
+ * indirectly serializing all the other holders of the commit_mutex with
+ * the rtnl_mutex.
+ */
dev = __dev_get_by_name(net, ifname);
if (!dev) {
err = -ENOENT;
@@ -1677,17 +1924,17 @@ err_hook_alloc:
return ERR_PTR(err);
}
-static bool nft_hook_list_find(struct list_head *hook_list,
- const struct nft_hook *this)
+static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
+ const struct nft_hook *this)
{
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
if (this->ops.dev == hook->ops.dev)
- return true;
+ return hook;
}
- return false;
+ return NULL;
}
static int nf_tables_parse_netdev_hooks(struct net *net,
@@ -1722,8 +1969,6 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
goto err_hook;
}
}
- if (!n)
- return -EINVAL;
return 0;
@@ -1760,6 +2005,9 @@ static int nft_chain_parse_netdev(struct net *net,
hook_list);
if (err < 0)
return err;
+
+ if (list_empty(hook_list))
+ return -EINVAL;
} else {
return -EINVAL;
}
@@ -1770,13 +2018,14 @@ static int nft_chain_parse_netdev(struct net *net,
static int nft_chain_parse_hook(struct net *net,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
- bool autoload)
+ struct netlink_ext_ack *extack, bool autoload)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nlattr *ha[NFTA_HOOK_MAX + 1];
const struct nft_chain_type *type;
int err;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX,
@@ -1799,23 +2048,28 @@ static int nft_chain_parse_hook(struct net *net,
if (nla[NFTA_CHAIN_TYPE]) {
type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
family, autoload);
- if (IS_ERR(type))
+ if (IS_ERR(type)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
return PTR_ERR(type);
+ }
}
- if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+ if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
return -EOPNOTSUPP;
if (type->type == NFT_CHAIN_T_NAT &&
hook->priority <= NF_IP_PRI_CONNTRACK)
return -EOPNOTSUPP;
- if (!try_module_get(type->owner))
+ if (!try_module_get(type->owner)) {
+ if (nla[NFTA_CHAIN_TYPE])
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
return -ENOENT;
+ }
hook->type = type;
INIT_LIST_HEAD(&hook->list);
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, hook->num)) {
err = nft_chain_parse_netdev(net, ha, &hook->list);
if (err < 0) {
module_put(type->owner);
@@ -1842,34 +2096,50 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
struct nft_rules_old {
struct rcu_head h;
- struct nft_rule **start;
+ struct nft_rule_blob *blob;
};
-static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
- unsigned int alloc)
+static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr)
{
- if (alloc > INT_MAX)
+ struct nft_rule_dp *prule;
+
+ prule = (struct nft_rule_dp *)ptr;
+ prule->is_last = 1;
+ /* blob size does not include the trailer rule */
+}
+
+static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size)
+{
+ struct nft_rule_blob *blob;
+
+ /* size must include room for the last rule */
+ if (size < offsetof(struct nft_rule_dp, data))
+ return NULL;
+
+ size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rules_old);
+ if (size > INT_MAX)
return NULL;
- alloc += 1; /* NULL, ends rules */
- if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+ blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!blob)
return NULL;
- alloc *= sizeof(struct nft_rule *);
- alloc += sizeof(struct nft_rules_old);
+ blob->size = 0;
+ nft_last_rule(blob, blob->data);
- return kvmalloc(alloc, GFP_KERNEL);
+ return blob;
}
static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
const struct nft_chain_hook *hook,
struct nft_chain *chain)
{
- ops->pf = family;
- ops->hooknum = hook->num;
- ops->priority = hook->priority;
- ops->priv = chain;
- ops->hook = hook->type->hooks[ops->hooknum];
+ ops->pf = family;
+ ops->hooknum = hook->num;
+ ops->priority = hook->priority;
+ ops->priv = chain;
+ ops->hook = hook->type->hooks[ops->hooknum];
+ ops->hook_ops_type = NF_HOOK_OP_NF_TABLES;
}
static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
@@ -1882,7 +2152,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
INIT_LIST_HEAD(&basechain->hook_list);
chain = &basechain->chain;
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, hook->num)) {
list_splice_init(&hook->list, &basechain->hook_list);
list_for_each_entry(h, &basechain->hook_list, list)
nft_basechain_hook_init(&h->ops, family, hook, chain);
@@ -1893,41 +2163,66 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
nft_basechain_hook_init(&basechain->ops, family, hook, chain);
}
- chain->flags |= NFT_BASE_CHAIN | flags;
+ chain->flags |= NFT_CHAIN_BASE | flags;
basechain->policy = NF_ACCEPT;
if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
- nft_chain_offload_priority(basechain) < 0)
+ !nft_chain_offload_support(basechain)) {
+ list_splice_init(&basechain->hook_list, &hook->list);
return -EOPNOTSUPP;
+ }
flow_block_init(&basechain->flow_block);
return 0;
}
+static int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
+{
+ int err;
+
+ err = rhltable_insert_key(&table->chains_ht, chain->name,
+ &chain->rhlhead, nft_chain_ht_params);
+ if (err)
+ return err;
+
+ list_add_tail_rcu(&chain->list, &table->chains);
+
+ return 0;
+}
+
+static u64 chain_id;
+
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy, u32 flags)
+ u8 policy, u32 flags,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
struct nft_base_chain *basechain;
- struct nft_stats __percpu *stats;
struct net *net = ctx->net;
+ char name[NFT_NAME_MAXLEN];
+ struct nft_rule_blob *blob;
struct nft_trans *trans;
struct nft_chain *chain;
- struct nft_rule **rules;
+ unsigned int data_size;
int err;
if (table->use == UINT_MAX)
return -EOVERFLOW;
if (nla[NFTA_CHAIN_HOOK]) {
+ struct nft_stats __percpu *stats = NULL;
struct nft_chain_hook hook;
- err = nft_chain_parse_hook(net, nla, &hook, family, true);
+ if (flags & NFT_CHAIN_BINDING)
+ return -EOPNOTSUPP;
+
+ err = nft_chain_parse_hook(net, nla, &hook, family, extack,
+ true);
if (err < 0)
return err;
- basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
if (basechain == NULL) {
nft_chain_release_hook(&hook);
return -ENOMEM;
@@ -1942,69 +2237,97 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
return PTR_ERR(stats);
}
rcu_assign_pointer(basechain->stats, stats);
- static_branch_inc(&nft_counters_enabled);
}
err = nft_basechain_init(basechain, family, &hook, flags);
if (err < 0) {
nft_chain_release_hook(&hook);
kfree(basechain);
+ free_percpu(stats);
return err;
}
+ if (stats)
+ static_branch_inc(&nft_counters_enabled);
} else {
- chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (flags & NFT_CHAIN_BASE)
+ return -EINVAL;
+ if (flags & NFT_CHAIN_HW_OFFLOAD)
+ return -EOPNOTSUPP;
+
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
if (chain == NULL)
return -ENOMEM;
+
+ chain->flags = flags;
}
ctx->chain = chain;
INIT_LIST_HEAD(&chain->rules);
chain->handle = nf_tables_alloc_handle(table);
chain->table = table;
- chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+
+ if (nla[NFTA_CHAIN_NAME]) {
+ chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
+ } else {
+ if (!(flags & NFT_CHAIN_BINDING)) {
+ err = -EINVAL;
+ goto err_destroy_chain;
+ }
+
+ snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
+ chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
+ }
+
if (!chain->name) {
err = -ENOMEM;
- goto err1;
+ goto err_destroy_chain;
+ }
+
+ if (nla[NFTA_CHAIN_USERDATA]) {
+ chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (chain->udata == NULL) {
+ err = -ENOMEM;
+ goto err_destroy_chain;
+ }
+ chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
- rules = nf_tables_chain_alloc_rules(chain, 0);
- if (!rules) {
+ data_size = offsetof(struct nft_rule_dp, data); /* last rule */
+ blob = nf_tables_chain_alloc_rules(data_size);
+ if (!blob) {
err = -ENOMEM;
- goto err1;
+ goto err_destroy_chain;
}
- *rules = NULL;
- rcu_assign_pointer(chain->rules_gen_0, rules);
- rcu_assign_pointer(chain->rules_gen_1, rules);
+ RCU_INIT_POINTER(chain->blob_gen_0, blob);
+ RCU_INIT_POINTER(chain->blob_gen_1, blob);
err = nf_tables_register_hook(net, table, chain);
if (err < 0)
- goto err1;
-
- err = rhltable_insert_key(&table->chains_ht, chain->name,
- &chain->rhlhead, nft_chain_ht_params);
- if (err)
- goto err2;
+ goto err_destroy_chain;
trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- rhltable_remove(&table->chains_ht, &chain->rhlhead,
- nft_chain_ht_params);
- goto err2;
+ goto err_unregister_hook;
}
nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
if (nft_is_base_chain(chain))
nft_trans_chain_policy(trans) = policy;
+ err = nft_chain_add(table, chain);
+ if (err < 0) {
+ nft_trans_destroy(trans);
+ goto err_unregister_hook;
+ }
+
table->use++;
- list_add_tail_rcu(&chain->list, &table->chains);
return 0;
-err2:
+err_unregister_hook:
nf_tables_unregister_hook(net, table, chain);
-err1:
+err_destroy_chain:
nf_tables_chain_destroy(ctx);
return err;
@@ -2030,7 +2353,8 @@ static bool nft_hook_list_equal(struct list_head *hook_list1,
}
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
- u32 flags)
+ u32 flags, const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -2046,32 +2370,36 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return -EOPNOTSUPP;
if (nla[NFTA_CHAIN_HOOK]) {
- if (!nft_is_base_chain(chain))
- return -EBUSY;
-
+ if (!nft_is_base_chain(chain)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
+ }
err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
- false);
+ extack, false);
if (err < 0)
return err;
basechain = nft_base_chain(chain);
if (basechain->type != hook.type) {
nft_chain_release_hook(&hook);
- return -EBUSY;
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
}
- if (ctx->family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(ctx->family, hook.num)) {
if (!nft_hook_list_equal(&basechain->hook_list,
&hook.list)) {
nft_chain_release_hook(&hook);
- return -EBUSY;
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
}
} else {
ops = &basechain->ops;
if (ops->hooknum != hook.num ||
ops->priority != hook.priority) {
nft_chain_release_hook(&hook);
- return -EBUSY;
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
}
}
nft_chain_release_hook(&hook);
@@ -2083,8 +2411,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
chain2 = nft_chain_lookup(ctx->net, table,
nla[NFTA_CHAIN_NAME], genmask);
- if (!IS_ERR(chain2))
+ if (!IS_ERR(chain2)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
return -EEXIST;
+ }
}
if (nla[NFTA_CHAIN_COUNTERS]) {
@@ -2112,21 +2442,23 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
if (nla[NFTA_CHAIN_HANDLE] &&
nla[NFTA_CHAIN_NAME]) {
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct nft_trans *tmp;
char *name;
err = -ENOMEM;
- name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
if (!name)
goto err;
err = -EEXIST;
- list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) {
+ list_for_each_entry(tmp, &nft_net->commit_list, list) {
if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
tmp->ctx.table == table &&
nft_trans_chain_update(tmp) &&
nft_trans_chain_name(tmp) &&
strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
kfree(name);
goto err;
}
@@ -2134,7 +2466,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nft_trans_chain_name(trans) = name;
}
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err:
@@ -2143,25 +2475,45 @@ err:
return err;
}
-static int nf_tables_newchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
+ const struct nft_table *table,
+ const struct nlattr *nla)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans *trans;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ struct nft_chain *chain = trans->ctx.chain;
+
+ if (trans->msg_type == NFT_MSG_NEWCHAIN &&
+ chain->table == table &&
+ id == nft_trans_chain_id(trans))
+ return chain;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_chain *chain = NULL;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
- struct nft_chain *chain;
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
u32 flags = 0;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -2178,7 +2530,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
return PTR_ERR(chain);
}
attr = nla[NFTA_CHAIN_HANDLE];
- } else {
+ } else if (nla[NFTA_CHAIN_NAME]) {
chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT) {
@@ -2187,6 +2539,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
chain = NULL;
}
+ } else if (!nla[NFTA_CHAIN_ID]) {
+ return -EINVAL;
}
if (nla[NFTA_CHAIN_POLICY]) {
@@ -2217,31 +2571,37 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
else if (chain)
flags = chain->flags;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ if (flags & ~NFT_CHAIN_FLAGS)
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain != NULL) {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
+
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- flags |= chain->flags & NFT_BASE_CHAIN;
- return nf_tables_updchain(&ctx, genmask, policy, flags);
+ flags |= chain->flags & NFT_CHAIN_BASE;
+ return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
+ extack);
}
- return nf_tables_addchain(&ctx, family, genmask, policy, flags);
+ return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack);
}
-static int nf_tables_delchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
@@ -2251,7 +2611,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
u32 use;
int err;
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -2270,11 +2631,11 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
return PTR_ERR(chain);
}
- if (nlh->nlmsg_flags & NLM_F_NONREC &&
+ if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
use = chain->use;
list_for_each_entry(rule, &chain->rules, list) {
@@ -2304,7 +2665,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
/**
* nft_register_expr - register nf_tables expr type
- * @ops: expr type
+ * @type: expr type
*
* Registers the expr type for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
@@ -2323,7 +2684,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr);
/**
* nft_unregister_expr - unregister nf_tables expr type
- * @ops: expr type
+ * @type: expr type
*
* Unregisters the expr typefor use with nf_tables.
*/
@@ -2438,6 +2799,7 @@ nla_put_failure:
struct nft_expr_info {
const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
};
@@ -2485,7 +2847,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
} else
ops = type->ops;
+ info->attr = nla;
info->ops = ops;
+
return 0;
err1:
@@ -2494,15 +2858,15 @@ err1:
}
static int nf_tables_newexpr(const struct nft_ctx *ctx,
- const struct nft_expr_info *info,
+ const struct nft_expr_info *expr_info,
struct nft_expr *expr)
{
- const struct nft_expr_ops *ops = info->ops;
+ const struct nft_expr_ops *ops = expr_info->ops;
int err;
expr->ops = ops;
if (ops->init) {
- err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+ err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
if (err < 0)
goto err1;
}
@@ -2523,40 +2887,62 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
module_put(type->owner);
}
-struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
- const struct nlattr *nla)
+static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
{
- struct nft_expr_info info;
+ struct nft_expr_info expr_info;
struct nft_expr *expr;
struct module *owner;
int err;
- err = nf_tables_expr_parse(ctx, nla, &info);
+ err = nf_tables_expr_parse(ctx, nla, &expr_info);
if (err < 0)
- goto err1;
+ goto err_expr_parse;
+
+ err = -EOPNOTSUPP;
+ if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err_expr_stateful;
err = -ENOMEM;
- expr = kzalloc(info.ops->size, GFP_KERNEL);
+ expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
if (expr == NULL)
- goto err2;
+ goto err_expr_stateful;
- err = nf_tables_newexpr(ctx, &info, expr);
+ err = nf_tables_newexpr(ctx, &expr_info, expr);
if (err < 0)
- goto err3;
+ goto err_expr_new;
return expr;
-err3:
+err_expr_new:
kfree(expr);
-err2:
- owner = info.ops->type->owner;
- if (info.ops->type->release_ops)
- info.ops->type->release_ops(info.ops);
+err_expr_stateful:
+ owner = expr_info.ops->type->owner;
+ if (expr_info.ops->type->release_ops)
+ expr_info.ops->type->release_ops(expr_info.ops);
module_put(owner);
-err1:
+err_expr_parse:
return ERR_PTR(err);
}
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+{
+ int err;
+
+ if (src->ops->clone) {
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src);
+ if (err < 0)
+ return err;
+ } else {
+ memcpy(dst, src, src->ops->size);
+ }
+
+ __module_get(src->ops->type->owner);
+
+ return 0;
+}
+
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
nf_tables_expr_destroy(ctx, expr);
@@ -2603,6 +2989,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
.len = NFT_USERDATA_MAXLEN },
[NFTA_RULE_ID] = { .type = NLA_U32 },
[NFTA_RULE_POSITION_ID] = { .type = NLA_U32 },
+ [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 },
};
static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
@@ -2610,24 +2997,18 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u32 flags, int family,
const struct nft_table *table,
const struct nft_chain *chain,
- const struct nft_rule *rule,
- const struct nft_rule *prule)
+ const struct nft_rule *rule, u64 handle)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
const struct nft_expr *expr, *next;
struct nlattr *list;
u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
+ nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
@@ -2636,13 +3017,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
NFTA_RULE_PAD))
goto nla_put_failure;
- if (event != NFT_MSG_DELRULE && prule) {
- if (nla_put_be64(skb, NFTA_RULE_POSITION,
- cpu_to_be64(prule->handle),
+ if (event != NFT_MSG_DELRULE && handle) {
+ if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle),
NFTA_RULE_PAD))
goto nla_put_failure;
}
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_stats(chain, rule);
+
list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS);
if (list == NULL)
goto nla_put_failure;
@@ -2670,7 +3053,11 @@ nla_put_failure:
static void nf_tables_rule_notify(const struct nft_ctx *ctx,
const struct nft_rule *rule, int event)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
+ const struct nft_rule *prule;
struct sk_buff *skb;
+ u64 handle = 0;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -2681,16 +3068,26 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (event == NFT_MSG_NEWRULE &&
+ !list_is_first(&rule->list, &ctx->chain->rules) &&
+ !list_is_last(&rule->list, &ctx->chain->rules)) {
+ prule = list_prev_entry(rule, list);
+ handle = prule->handle;
+ }
+ if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE))
+ flags |= NLM_F_APPEND;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table,
- ctx->chain, rule, NULL);
+ event, flags, ctx->family, ctx->table,
+ ctx->chain, rule, handle);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -2710,6 +3107,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
const struct nft_rule *rule, *prule;
unsigned int s_idx = cb->args[0];
+ u64 handle;
prule = NULL;
list_for_each_entry_rcu(rule, &chain->rules, list) {
@@ -2721,12 +3119,17 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
memset(&cb->args[1], 0,
sizeof(cb->args) - sizeof(cb->args[0]));
}
+ if (prule)
+ handle = prule->handle;
+ else
+ handle = 0;
+
if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFT_MSG_NEWRULE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- table, chain, rule, prule) < 0)
+ table, chain, rule, handle) < 0)
return 1;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -2748,11 +3151,13 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
unsigned int idx = 0;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -2838,21 +3243,20 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_chain *chain;
const struct nft_rule *rule;
+ struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start= nf_tables_dump_rules_start,
.dump = nf_tables_dump_rules,
@@ -2861,10 +3265,10 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
.data = (void *)nla,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
@@ -2887,14 +3291,14 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
- family, table, chain, rule, NULL);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+ family, table, chain, rule, 0);
if (err < 0)
- goto err;
+ goto err_fill_rule_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_rule_info:
kfree_skb(skb2);
return err;
}
@@ -2909,7 +3313,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
* is called on error from nf_tables_newrule().
*/
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
next = nft_expr_next(expr);
nf_tables_expr_destroy(ctx, expr);
expr = next;
@@ -2917,8 +3321,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
kfree(rule);
}
-static void nf_tables_rule_release(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
{
nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
nf_tables_rule_destroy(ctx, rule);
@@ -2946,6 +3349,8 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (err < 0)
return err;
}
+
+ cond_resched();
}
return 0;
@@ -2975,44 +3380,60 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
}
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla);
#define NFT_RULE_MAXEXPRS 128
-static int nf_tables_newrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- struct nft_expr_info *info = NULL;
- int family = nfmsg->nfgen_family;
- struct nft_flow_rule *flow;
- struct nft_table *table;
- struct nft_chain *chain;
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ unsigned int size, i, n, ulen = 0, usize = 0;
+ u8 genmask = nft_genmask_next(info->net);
struct nft_rule *rule, *old_rule = NULL;
+ struct nft_expr_info *expr_info = NULL;
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_flow_rule *flow = NULL;
+ struct net *net = info->net;
struct nft_userdata *udata;
- struct nft_trans *trans = NULL;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_trans *trans;
+ u64 handle, pos_handle;
struct nft_expr *expr;
struct nft_ctx ctx;
struct nlattr *tmp;
- unsigned int size, i, n, ulen = 0, usize = 0;
int err, rem;
- u64 handle, pos_handle;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
}
- chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
- return PTR_ERR(chain);
+ if (nla[NFTA_RULE_CHAIN]) {
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
+ genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
+ return PTR_ERR(chain);
+ }
+ if (nft_chain_is_bound(chain))
+ return -EOPNOTSUPP;
+
+ } else if (nla[NFTA_RULE_CHAIN_ID]) {
+ chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
+ return PTR_ERR(chain);
+ }
+ } else {
+ return -EINVAL;
}
if (nla[NFTA_RULE_HANDLE]) {
@@ -3023,17 +3444,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return PTR_ERR(rule);
}
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
return -EOPNOTSUPP;
} else {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
- nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
+ info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);
@@ -3048,7 +3469,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return PTR_ERR(old_rule);
}
} else if (nla[NFTA_RULE_POSITION_ID]) {
- old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]);
+ old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
return PTR_ERR(old_rule);
@@ -3056,34 +3477,36 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) {
- info = kvmalloc_array(NFT_RULE_MAXEXPRS,
- sizeof(struct nft_expr_info),
- GFP_KERNEL);
- if (!info)
+ expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+ sizeof(struct nft_expr_info),
+ GFP_KERNEL);
+ if (!expr_info)
return -ENOMEM;
nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
- goto err1;
+ goto err_release_expr;
if (n == NFT_RULE_MAXEXPRS)
- goto err1;
- err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
- if (err < 0)
- goto err1;
- size += info[n].ops->size;
+ goto err_release_expr;
+ err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, tmp);
+ goto err_release_expr;
+ }
+ size += expr_info[n].ops->size;
n++;
}
}
/* Check for overflow of dlen field */
err = -EFBIG;
if (size >= 1 << 12)
- goto err1;
+ goto err_release_expr;
if (nla[NFTA_RULE_USERDATA]) {
ulen = nla_len(nla[NFTA_RULE_USERDATA]);
@@ -3092,9 +3515,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
err = -ENOMEM;
- rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
if (rule == NULL)
- goto err1;
+ goto err_release_expr;
nft_activate_next(net, rule);
@@ -3110,38 +3533,46 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
expr = nft_expr_first(rule);
for (i = 0; i < n; i++) {
- err = nf_tables_newexpr(&ctx, &info[i], expr);
- if (err < 0)
- goto err2;
+ err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, expr_info[i].attr);
+ goto err_release_rule;
+ }
- if (info[i].ops->validate)
+ if (expr_info[i].ops->validate)
nft_validate_state_update(net, NFT_VALIDATE_NEED);
- info[i].ops = NULL;
+ expr_info[i].ops = NULL;
expr = nft_expr_next(expr);
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(net, rule);
+ if (IS_ERR(flow)) {
+ err = PTR_ERR(flow);
+ goto err_release_rule;
+ }
+ }
+
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
+ err = nft_delrule(&ctx, old_rule);
+ if (err < 0)
+ goto err_destroy_flow_rule;
+
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (trans == NULL) {
err = -ENOMEM;
- goto err2;
+ goto err_destroy_flow_rule;
}
- err = nft_delrule(&ctx, old_rule);
- if (err < 0) {
- nft_trans_destroy(trans);
- goto err2;
- }
-
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (!trans) {
err = -ENOMEM;
- goto err2;
+ goto err_destroy_flow_rule;
}
- if (nlh->nlmsg_flags & NLM_F_APPEND) {
+ if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
if (old_rule)
list_add_rcu(&rule->list, &old_rule->list);
else
@@ -3153,65 +3584,69 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
list_add_rcu(&rule->list, &chain->rules);
}
}
- kvfree(info);
+ kvfree(expr_info);
chain->use++;
- if (net->nft.validate_state == NFT_VALIDATE_DO)
- return nft_table_validate(net, table);
-
- if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
- flow = nft_flow_rule_create(net, rule);
- if (IS_ERR(flow))
- return PTR_ERR(flow);
-
+ if (flow)
nft_trans_flow_rule(trans) = flow;
- }
+
+ if (nft_net->validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
return 0;
-err2:
+
+err_destroy_flow_rule:
+ if (flow)
+ nft_flow_rule_destroy(flow);
+err_release_rule:
nf_tables_rule_release(&ctx, rule);
-err1:
+err_release_expr:
for (i = 0; i < n; i++) {
- if (info[i].ops) {
- module_put(info[i].ops->type->owner);
- if (info[i].ops->type->release_ops)
- info[i].ops->type->release_ops(info[i].ops);
+ if (expr_info[i].ops) {
+ module_put(expr_info[i].ops->type->owner);
+ if (expr_info[i].ops->type->release_ops)
+ expr_info[i].ops->type->release_ops(expr_info[i].ops);
}
}
- kvfree(info);
+ kvfree(expr_info);
+
return err;
}
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
struct nft_trans *trans;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
struct nft_rule *rule = nft_trans_rule(trans);
if (trans->msg_type == NFT_MSG_NEWRULE &&
+ trans->ctx.chain == chain &&
id == nft_trans_rule_id(trans))
return rule;
}
return ERR_PTR(-ENOENT);
}
-static int nf_tables_delrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- struct nft_table *table;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_chain *chain = NULL;
+ struct net *net = info->net;
+ struct nft_table *table;
struct nft_rule *rule;
- int family = nfmsg->nfgen_family, err = 0;
struct nft_ctx ctx;
+ int err = 0;
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
@@ -3224,9 +3659,11 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
+ if (nft_chain_is_bound(chain))
+ return -EOPNOTSUPP;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
@@ -3238,7 +3675,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
err = nft_delrule(&ctx, rule);
} else if (nla[NFTA_RULE_ID]) {
- rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+ rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
return PTR_ERR(rule);
@@ -3266,25 +3703,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
/*
* Sets
*/
-
-static LIST_HEAD(nf_tables_set_types);
-
-int nft_register_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_add_tail_rcu(&type->list, &nf_tables_set_types);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_register_set);
-
-void nft_unregister_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_del_rcu(&type->list);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_unregister_set);
+static const struct nft_set_type *nft_set_types[] = {
+ &nft_set_hash_fast_type,
+ &nft_set_hash_type,
+ &nft_set_rhash_type,
+ &nft_set_bitmap_type,
+ &nft_set_rbtree_type,
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ &nft_set_pipapo_avx2_type,
+#endif
+ &nft_set_pipapo_type,
+};
#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
@@ -3306,19 +3735,16 @@ nft_select_set_ops(const struct nft_ctx *ctx,
const struct nft_set_desc *desc,
enum nft_set_policies policy)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nft_set_ops *ops, *bops;
struct nft_set_estimate est, best;
const struct nft_set_type *type;
u32 flags = 0;
+ int i;
- lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
-#ifdef CONFIG_MODULES
- if (list_empty(&nf_tables_set_types)) {
- if (nft_request_module(ctx->net, "nft-set") == -EAGAIN)
- return ERR_PTR(-EAGAIN);
- }
-#endif
+
if (nla[NFTA_SET_FLAGS] != NULL)
flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
@@ -3327,7 +3753,8 @@ nft_select_set_ops(const struct nft_ctx *ctx,
best.lookup = ~0;
best.space = ~0;
- list_for_each_entry(type, &nf_tables_set_types, list) {
+ for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) {
+ type = nft_set_types[i];
ops = &type->ops;
if (!nft_set_ops_candidate(type, flags))
@@ -3358,11 +3785,6 @@ nft_select_set_ops(const struct nft_ctx *ctx,
break;
}
- if (!try_module_get(type->owner))
- continue;
- if (bops != NULL)
- module_put(to_set_type(bops)->owner);
-
bops = ops;
best = est;
}
@@ -3392,6 +3814,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
+ [NFTA_SET_EXPR] = { .type = NLA_NESTED },
+ [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -3399,30 +3823,6 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED },
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
- const struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack,
- u8 genmask)
-{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- int family = nfmsg->nfgen_family;
- struct nft_table *table = NULL;
-
- if (nla[NFTA_SET_TABLE] != NULL) {
- table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
- genmask);
- if (IS_ERR(table)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
- return PTR_ERR(table);
- }
- }
-
- nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
- return 0;
-}
-
static struct nft_set *nft_set_lookup(const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
@@ -3454,16 +3854,19 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
}
static struct nft_set *nft_set_lookup_byid(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
- struct nft_trans *trans;
+ struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans *trans;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
if (trans->msg_type == NFT_MSG_NEWSET) {
struct nft_set *set = nft_trans_set(trans);
if (id == nft_trans_set_id(trans) &&
+ set->table == table &&
nft_active_genmask(set, genmask))
return set;
}
@@ -3484,7 +3887,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
if (!nla_set_id)
return set;
- set = nft_set_lookup_byid(net, nla_set_id, genmask);
+ set = nft_set_lookup_byid(net, table, nla_set_id, genmask);
}
return set;
}
@@ -3510,7 +3913,7 @@ cont:
list_for_each_entry(i, &ctx->table->sets, list) {
int tmp;
- if (!nft_is_active_next(ctx->net, set))
+ if (!nft_is_active_next(ctx->net, i))
continue;
if (!sscanf(i->name, name, &tmp))
continue;
@@ -3529,7 +3932,7 @@ cont:
free_page((unsigned long)inuse);
}
- set->name = kasprintf(GFP_KERNEL, name, min + n);
+ set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
if (!set->name)
return -ENOMEM;
@@ -3538,13 +3941,14 @@ cont:
continue;
if (!strcmp(set->name, i->name)) {
kfree(set->name);
+ set->name = NULL;
return -ENFILE;
}
}
return 0;
}
-static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
{
u64 ms = be64_to_cpu(nla_get_be64(nla));
u64 max = (u64)(~((u64)0));
@@ -3558,7 +3962,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
return 0;
}
-static __be64 nf_jiffies64_to_msecs(u64 input)
+__be64 nf_jiffies64_to_msecs(u64 input)
{
return cpu_to_be64(jiffies64_to_msecs(input));
}
@@ -3593,23 +3997,18 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb,
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
- struct nlattr *desc;
u32 portid = ctx->portid;
+ struct nlattr *nest;
u32 seq = ctx->seq;
+ int i;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
+ NFNETLINK_V0, nft_base_seq(ctx->net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -3649,12 +4048,12 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
goto nla_put_failure;
}
- if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
+ if (set->udata &&
+ nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
goto nla_put_failure;
- desc = nla_nest_start_noflag(skb, NFTA_SET_DESC);
-
- if (desc == NULL)
+ nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
+ if (!nest)
goto nla_put_failure;
if (set->size &&
nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
@@ -3664,7 +4063,26 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nf_tables_fill_set_concat(skb, set))
goto nla_put_failure;
- nla_nest_end(skb, desc);
+ nla_nest_end(skb, nest);
+
+ if (set->num_exprs == 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
+ if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ } else if (set->num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ set->exprs[i]) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
nlmsg_end(skb, nlh);
return 0;
@@ -3678,8 +4096,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
const struct nft_set *set, int event,
gfp_t gfp_flags)
{
- struct sk_buff *skb;
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
u32 portid = ctx->portid;
+ struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -3690,14 +4110,16 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
- err = nf_tables_fill_set(skb, ctx, set, event, 0);
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
+ err = nf_tables_fill_set(skb, ctx, set, event, flags);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
- gfp_flags);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -3710,14 +4132,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
struct net *net = sock_net(skb->sk);
struct nft_ctx *ctx = cb->data, ctx_set;
+ struct nftables_pernet *nft_net;
if (cb->args[1])
return skb->len;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
ctx->family != table->family)
continue;
@@ -3781,25 +4205,31 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_table *table = NULL;
+ struct net *net = info->net;
const struct nft_set *set;
- struct nft_ctx ctx;
struct sk_buff *skb2;
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_ctx ctx;
int err;
- /* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ if (nla[NFTA_SET_TABLE]) {
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
+ return PTR_ERR(table);
+ }
+ }
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_sets_start,
.dump = nf_tables_dump_sets,
@@ -3808,16 +4238,16 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
/* Only accept unspec with dump */
- if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
@@ -3827,11 +4257,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
if (err < 0)
- goto err;
+ goto err_fill_set_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_set_info:
kfree_skb(skb2);
return err;
}
@@ -3847,6 +4277,9 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
u32 len;
int err;
+ if (desc->field_count >= ARRAY_SIZE(desc->field_len))
+ return -E2BIG;
+
err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
nft_concat_policy, NULL);
if (err < 0)
@@ -3856,9 +4289,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
return -EINVAL;
len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
-
- if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT)
- return -E2BIG;
+ if (!len || len > U8_MAX)
+ return -EINVAL;
desc->field_len[desc->field_count++] = len;
@@ -3869,7 +4301,8 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
const struct nlattr *nla)
{
struct nlattr *attr;
- int rem, err;
+ u32 num_regs = 0;
+ int rem, err, i;
nla_for_each_nested(attr, nla, rem) {
if (nla_type(attr) != NFTA_LIST_ELEM)
@@ -3880,6 +4313,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
return err;
}
+ for (i = 0; i < desc->field_count; i++)
+ num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32));
+
+ if (num_regs > NFT_REG32_COUNT)
+ return -E2BIG;
+
return 0;
}
@@ -3902,27 +4341,27 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
return err;
}
-static int nf_tables_newset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ u32 ktype, dtype, flags, policy, gc_int, objtype;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_set_ops *ops;
+ struct nft_expr *expr = NULL;
+ struct net *net = info->net;
+ struct nft_set_desc desc;
struct nft_table *table;
+ unsigned char *udata;
struct nft_set *set;
struct nft_ctx ctx;
- char *name;
- u64 size;
+ size_t alloc_size;
u64 timeout;
- u32 ktype, dtype, flags, policy, gc_int, objtype;
- struct nft_set_desc desc;
- unsigned char *udata;
+ char *name;
+ int err, i;
u16 udlen;
- int err;
- int i;
+ u64 size;
if (nla[NFTA_SET_TABLE] == NULL ||
nla[NFTA_SET_NAME] == NULL ||
@@ -3949,8 +4388,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
NFT_SET_MAP | NFT_SET_EVAL |
- NFT_SET_OBJECT))
- return -EINVAL;
+ NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR))
+ return -EOPNOTSUPP;
/* Only one of these operations is supported */
if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
(NFT_SET_MAP | NFT_SET_OBJECT))
@@ -3988,7 +4427,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
if (objtype == NFT_OBJECT_UNSPEC ||
objtype > NFT_OBJECT_MAX)
- return -EINVAL;
+ return -EOPNOTSUPP;
} else if (flags & NFT_SET_OBJECT)
return -EINVAL;
else
@@ -4018,15 +4457,24 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
+
+ if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT))
+ return -EINVAL;
+ } else if (flags & NFT_SET_CONCAT) {
+ return -EINVAL;
}
- table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
+ if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
+ desc.expr = true;
+
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
@@ -4035,17 +4483,17 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return PTR_ERR(set);
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
return 0;
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
ops = nft_select_set_ops(&ctx, nla, &desc, policy);
@@ -4059,23 +4507,23 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
size = 0;
if (ops->privsize != NULL)
size = ops->privsize(nla, &desc);
+ alloc_size = sizeof(*set) + size + udlen;
+ if (alloc_size < size || alloc_size > INT_MAX)
+ return -ENOMEM;
+ set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!set)
+ return -ENOMEM;
- set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
- if (!set) {
- err = -ENOMEM;
- goto err1;
- }
-
- name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
if (!name) {
err = -ENOMEM;
- goto err2;
+ goto err_set_name;
}
err = nf_tables_set_alloc_name(&ctx, set, name);
kfree(name);
if (err < 0)
- goto err2;
+ goto err_set_name;
udata = NULL;
if (udlen) {
@@ -4084,22 +4532,22 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
+ INIT_LIST_HEAD(&set->catchall_list);
set->table = table;
write_pnet(&set->net, net);
- set->ops = ops;
+ set->ops = ops;
set->ktype = ktype;
- set->klen = desc.klen;
+ set->klen = desc.klen;
set->dtype = dtype;
set->objtype = objtype;
- set->dlen = desc.dlen;
+ set->dlen = desc.dlen;
set->flags = flags;
- set->size = desc.size;
+ set->size = desc.size;
set->policy = policy;
- set->udlen = udlen;
- set->udata = udata;
+ set->udlen = udlen;
+ set->udata = udata;
set->timeout = timeout;
set->gc_int = gc_int;
- set->handle = nf_tables_alloc_handle(table);
set->field_count = desc.field_count;
for (i = 0; i < desc.field_count; i++)
@@ -4107,66 +4555,129 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = ops->init(set, &desc, nla);
if (err < 0)
- goto err3;
+ goto err_set_init;
+
+ if (nla[NFTA_SET_EXPR]) {
+ expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_expr_alloc;
+ }
+ set->exprs[0] = expr;
+ set->num_exprs++;
+ } else if (nla[NFTA_SET_EXPRESSIONS]) {
+ struct nft_expr *expr;
+ struct nlattr *tmp;
+ int left;
+
+ if (!(flags & NFT_SET_EXPR)) {
+ err = -EINVAL;
+ goto err_set_expr_alloc;
+ }
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_set_expr_alloc;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_expr_alloc;
+ }
+ expr = nft_set_elem_expr_alloc(&ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_expr_alloc;
+ }
+ set->exprs[i++] = expr;
+ set->num_exprs++;
+ }
+ }
+
+ set->handle = nf_tables_alloc_handle(table);
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err4;
+ goto err_set_expr_alloc;
list_add_tail_rcu(&set->list, &table->sets);
table->use++;
return 0;
-err4:
+err_set_expr_alloc:
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(&ctx, set->exprs[i]);
+
ops->destroy(set);
-err3:
+err_set_init:
kfree(set->name);
-err2:
+err_set_name:
kvfree(set);
-err1:
- module_put(to_set_type(ops)->owner);
return err;
}
-static void nft_set_destroy(struct nft_set *set)
+struct nft_set_elem_catchall { /* one node on a set's catchall_list */
+ struct list_head list; /* linkage used when walking set->catchall_list */
+ struct rcu_head rcu; /* handed to kfree_rcu() when the node is released */
+ void *elem; /* element private data; passed to nft_set_elem_ext() and nft_set_elem_destroy() */
+};
+
+static void nft_set_catchall_destroy(const struct nft_ctx *ctx, /* Empty set->catchall_list, destroying each element; ctx is not referenced in this body. */
+ struct nft_set *set)
+{
+ struct nft_set_elem_catchall *next, *catchall;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { /* _safe walk: every node is unlinked inside the loop */
+ list_del_rcu(&catchall->list);
+ nft_set_elem_destroy(set, catchall->elem, true); /* release the element this node points at */
+ kfree_rcu(catchall, rcu); /* free the list node itself through its rcu_head */
+ }
+}
+
+static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
+ int i;
+
if (WARN_ON(set->use > 0))
return;
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(ctx, set->exprs[i]);
+
set->ops->destroy(set);
- module_put(to_set_type(set->ops)->owner);
+ nft_set_catchall_destroy(ctx, set);
kfree(set->name);
kvfree(set);
}
-static int nf_tables_delset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
- int err;
- if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
- if (nla[NFTA_SET_TABLE] == NULL)
- return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
+ return PTR_ERR(table);
+ }
if (nla[NFTA_SET_HANDLE]) {
attr = nla[NFTA_SET_HANDLE];
- set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
+ set = nft_set_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_SET_NAME];
- set = nft_set_lookup(ctx.table, attr, genmask);
+ set = nft_set_lookup(table, attr, genmask);
}
if (IS_ERR(set)) {
@@ -4174,18 +4685,26 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
return PTR_ERR(set);
}
if (set->use ||
- (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+ (info->nlh->nlmsg_flags & NLM_F_NONREC &&
+ atomic_read(&set->nelems) > 0)) {
NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
}
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
return nft_delset(&ctx, set);
}
-static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+static int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type,
+ unsigned int len);
+
+static int nft_setelem_data_validate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
enum nft_registers dreg;
@@ -4197,6 +4716,37 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
set->dlen);
}
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, /* Set-walk callback (installed as iter.fn by nf_tables_bind_set): validate one element's data. */
+ struct nft_set *set,
+ const struct nft_set_iter *iter, /* not used in this body */
+ struct nft_set_elem *elem)
+{
+ return nft_setelem_data_validate(ctx, set, elem); /* result is passed straight through */
+}
+
+static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, /* Run nft_setelem_data_validate() over the set's catch-all entries; returns 0 or the first negative result. */
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net); /* activity is tested against the next generation mask */
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_elem elem; /* only .priv is filled in before use */
+ struct nft_set_ext *ext;
+ int ret = 0; /* stays 0 when the list is empty or every entry is skipped */
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask)) /* skip entries not active under genmask */
+ continue;
+
+ elem.priv = catchall->elem;
+ ret = nft_setelem_data_validate(ctx, set, &elem);
+ if (ret < 0) /* stop at the first validation failure */
+ break;
+ }
+
+ return ret;
+}
+
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
{
@@ -4226,6 +4776,9 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
iter.fn = nf_tables_bind_check_setelem;
set->ops->walk(ctx, set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_bind_check(ctx, set);
+
if (iter.err < 0)
return iter.err;
}
@@ -4263,7 +4816,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
set->use--;
- /* fall through */
+ fallthrough;
default:
nf_tables_unbind_set(ctx, set, binding,
phase == NFT_TRANS_COMMIT);
@@ -4274,7 +4827,7 @@ EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
- nft_set_destroy(set);
+ nft_set_destroy(ctx, set);
}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
@@ -4285,8 +4838,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_DATA] = {
.align = __alignof__(u32),
},
- [NFT_SET_EXT_EXPR] = {
- .align = __alignof__(struct nft_expr),
+ [NFT_SET_EXT_EXPRESSIONS] = {
+ .align = __alignof__(struct nft_set_elem_expr),
},
[NFT_SET_EXT_OBJREF] = {
.len = sizeof(struct nft_object *),
@@ -4312,7 +4865,6 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u32),
},
};
-EXPORT_SYMBOL_GPL(nft_set_ext_types);
/*
* Set elements
@@ -4330,6 +4882,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
.len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -4341,26 +4894,41 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
- const struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack,
- u8 genmask)
+static int nft_set_elem_expr_dump(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_ext *ext)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- int family = nfmsg->nfgen_family;
- struct nft_table *table;
+ struct nft_set_elem_expr *elem_expr;
+ u32 size, num_exprs = 0;
+ struct nft_expr *expr;
+ struct nlattr *nest;
- table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
- genmask);
- if (IS_ERR(table)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
- return PTR_ERR(table);
- }
+ elem_expr = nft_set_ext_expr(ext);
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ num_exprs++;
- nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
+ if (num_exprs == 1) {
+ expr = nft_setelem_expr_at(elem_expr, 0);
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0)
+ return -1;
+
+ return 0;
+ } else if (num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ expr = nft_setelem_expr_at(elem_expr, size);
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
return 0;
+
+nla_put_failure:
+ return -1;
}
static int nf_tables_fill_setelem(struct sk_buff *skb,
@@ -4375,7 +4943,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nest == NULL)
goto nla_put_failure;
- if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
NFT_DATA_VALUE, set->klen) < 0)
goto nla_put_failure;
@@ -4390,8 +4959,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
set->dlen) < 0)
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) &&
- nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_set_elem_expr_dump(skb, set, ext))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
@@ -4464,22 +5033,48 @@ struct nft_set_dump_ctx {
struct nft_ctx ctx;
};
+static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, /* Emit the first usable catch-all element of the set into skb via nf_tables_fill_setelem(). */
+ const struct nft_set *set)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net); /* activity is tested against the current generation mask */
+ struct nft_set_elem elem; /* only .priv is filled in before use */
+ struct nft_set_ext *ext;
+ int ret = 0; /* returned unchanged when no entry qualifies */
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) || /* skip entries that are inactive ... */
+ nft_set_elem_expired(ext)) /* ... or already expired */
+ continue;
+
+ elem.priv = catchall->elem;
+ ret = nf_tables_fill_setelem(skb, set, &elem);
+ break; /* unconditional: at most one entry is written */
+ }
+
+ return ret;
+}
+
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nft_set_dump_ctx *dump_ctx = cb->data;
struct net *net = sock_net(skb->sk);
+ struct nftables_pernet *nft_net;
struct nft_table *table;
struct nft_set *set;
struct nft_set_dump_args args;
bool set_found = false;
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
u32 portid, seq;
int event;
rcu_read_lock();
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
+
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
dump_ctx->ctx.family != table->family)
continue;
@@ -4505,16 +5100,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- NLM_F_MULTI);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
+ table->family, NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = table->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
@@ -4532,6 +5122,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
args.iter.err = 0;
args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&dump_ctx->ctx, set, &args.iter);
+
+ if (!args.iter.err && args.iter.count == cb->args[0])
+ args.iter.err = nft_set_catchall_dump(net, skb, set);
rcu_read_unlock();
nla_nest_end(skb, nest);
@@ -4571,22 +5164,16 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
const struct nft_set *set,
const struct nft_set_elem *elem)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
int err;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
+ NFNETLINK_V0, nft_base_seq(ctx->net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -4617,11 +5204,14 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
return 0;
*flags = ntohl(nla_get_be32(attr));
- if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
+ if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EOPNOTSUPP;
if (!(set->flags & NFT_SET_INTERVAL) &&
*flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
+ if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) ==
+ (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
return 0;
}
@@ -4629,18 +5219,72 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data *key, struct nlattr *attr)
{
- struct nft_data_desc desc;
- int err;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = NFT_DATA_VALUE_MAXLEN,
+ .len = set->klen,
+ };
- err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr);
- if (err < 0)
- return err;
+ return nft_data_init(ctx, key, &desc, attr);
+}
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
- nft_data_release(key, desc.type);
- return -EINVAL;
+static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, /* Fill *desc from the set's data type/length, then parse attr into *data with nft_data_init(). */
+ struct nft_data_desc *desc, /* overwritten here: type, size, len and flags */
+ struct nft_data *data,
+ struct nlattr *attr)
+{
+ u32 dtype;
+
+ if (set->dtype == NFT_DATA_VERDICT) /* verdict sets keep the verdict type ... */
+ dtype = NFT_DATA_VERDICT;
+ else /* ... every other set dtype is parsed as a plain value */
+ dtype = NFT_DATA_VALUE;
+
+ desc->type = dtype;
+ desc->size = NFT_DATA_VALUE_MAXLEN;
+ desc->len = set->dlen; /* length taken from the set's data length */
+ desc->flags = NFT_DATA_DESC_SETELEM;
+
+ return nft_data_init(ctx, data, desc, attr); /* return value of nft_data_init() is passed through */
+}
+
+static void *nft_setelem_catchall_get(const struct net *net, /* Return the elem pointer of the first active, unexpired catch-all entry, or NULL if there is none. */
+ const struct nft_set *set)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net); /* activity is tested against the current generation mask */
+ struct nft_set_ext *ext;
+ void *priv = NULL; /* remains NULL when no entry qualifies */
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) || /* skip entries that are inactive ... */
+ nft_set_elem_expired(ext)) /* ... or already expired */
+ continue;
+
+ priv = catchall->elem;
+ break; /* first match wins */
}
+ return priv;
+}
+
+static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, /* Look up an element and store it in elem->priv; returns 0 or a negative errno. */
+ struct nft_set_elem *elem, u32 flags)
+{
+ void *priv;
+
+ if (!(flags & NFT_SET_ELEM_CATCHALL)) { /* ordinary element: ask the set backend */
+ priv = set->ops->get(ctx->net, set, elem, flags);
+ if (IS_ERR(priv)) /* backend reports failure as an ERR_PTR */
+ return PTR_ERR(priv);
+ } else { /* catch-all element: search the set's catchall_list instead */
+ priv = nft_setelem_catchall_get(ctx->net, set);
+ if (!priv) /* NULL means no usable catch-all entry */
+ return -ENOENT;
+ }
+ elem->priv = priv;
+
return 0;
}
@@ -4651,7 +5295,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_elem elem;
struct sk_buff *skb;
uint32_t flags = 0;
- void *priv;
int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
@@ -4659,17 +5302,19 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- if (!nla[NFTA_SET_ELEM_KEY])
- return -EINVAL;
-
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- err = nft_setelem_parse_key(ctx, set, &elem.key.val,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- return err;
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
+
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ return err;
+ }
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -4678,57 +5323,56 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
- priv = set->ops->get(ctx->net, set, &elem, flags);
- if (IS_ERR(priv))
- return PTR_ERR(priv);
-
- elem.priv = priv;
+ err = nft_setelem_get(ctx, set, &elem, flags);
+ if (err < 0)
+ return err;
err = -ENOMEM;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb == NULL)
- goto err1;
+ return err;
err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid,
NFT_MSG_NEWSETELEM, 0, set, &elem);
if (err < 0)
- goto err2;
+ goto err_fill_setelem;
- err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT);
- /* This avoids a loop in nfnetlink. */
- if (err < 0)
- goto err1;
+ return nfnetlink_unicast(skb, ctx->net, ctx->portid);
- return 0;
-err2:
+err_fill_setelem:
kfree_skb(skb);
-err1:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return err;
}
/* called with rcu_read_lock held */
-static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
-{
- u8 genmask = nft_genmask_cur(net);
+static int nf_tables_getsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
+ struct nft_table *table;
struct nft_set *set;
struct nlattr *attr;
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_set_start,
.dump = nf_tables_dump_set,
@@ -4741,7 +5385,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
};
c.data = &dump_ctx;
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -4749,8 +5393,10 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_get_set_elem(&ctx, set, attr);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
break;
+ }
}
return err;
@@ -4759,11 +5405,13 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
const struct nft_set *set,
const struct nft_set_elem *elem,
- int event, u16 flags)
+ int event)
{
+ struct nftables_pernet *nft_net;
struct net *net = ctx->net;
u32 portid = ctx->portid;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -4773,6 +5421,9 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
set, elem);
if (err < 0) {
@@ -4780,8 +5431,8 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
- GFP_KERNEL);
+ nft_net = nft_pernet(net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -4801,6 +5452,54 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
return trans;
}
+struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, /* Build an expression from attr for use with set; returns the expression or an ERR_PTR. */
+ const struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_expr_init(ctx, attr);
+ if (IS_ERR(expr)) /* propagate the ERR_PTR from nft_expr_init() unchanged */
+ return expr;
+
+ err = -EOPNOTSUPP; /* error used by both rejection paths below */
+ if (expr->ops->type->flags & NFT_EXPR_GC) { /* extra requirements for expression types flagged NFT_EXPR_GC */
+ if (set->flags & NFT_SET_TIMEOUT) /* refused on sets that have NFT_SET_TIMEOUT */
+ goto err_set_elem_expr;
+ if (!set->ops->gc_init) /* refused when the set backend provides no gc_init hook */
+ goto err_set_elem_expr;
+ set->ops->gc_init(set);
+ }
+
+ return expr;
+
+err_set_elem_expr:
+ nft_expr_destroy(ctx, expr); /* undo nft_expr_init() before reporting the error */
+ return ERR_PTR(err);
+}
+
+static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len) /* Returns 0 if len plus the type's fixed length fits extension id of tmpl, -1 otherwise. */
+{
+ len += nft_set_ext_types[id].len; /* add the per-type length from nft_set_ext_types[] */
+ if (len > tmpl->ext_len[id] || /* larger than what the template recorded for this extension */
+ len > U8_MAX) /* or larger than U8_MAX */
+ return -1;
+
+ return 0;
+}
+
+static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, /* Copy len bytes from 'from' to 'to' only if nft_set_ext_check() accepts the size; 0 on success, -1 otherwise. */
+ void *to, const void *from, u32 len)
+{
+ if (nft_set_ext_check(tmpl, id, len) < 0) /* refuse the copy; nothing is written to 'to' */
+ return -1;
+
+ memcpy(to, from, len);
+
+ return 0;
+}
+
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end,
@@ -4811,16 +5510,26 @@ void *nft_set_elem_init(const struct nft_set *set,
elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
if (elem == NULL)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ext = nft_set_elem_ext(set, elem);
nft_set_ext_init(ext, tmpl);
- memcpy(nft_set_ext_key(ext), key, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
- memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- memcpy(nft_set_ext_data(ext), data, set->dlen);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
+ nft_set_ext_key(ext), key, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
+ nft_set_ext_key_end(ext), key_end, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
+ nft_set_ext_data(ext), data, set->dlen) < 0)
+ goto err_ext_check;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
if (expiration == 0)
@@ -4830,6 +5539,32 @@ void *nft_set_elem_init(const struct nft_set *set,
*nft_set_ext_timeout(ext) = timeout;
return elem;
+
+err_ext_check:
+ kfree(elem);
+
+ return ERR_PTR(-EINVAL);
+}
+
+static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
+{ /* Tear down one element expression, using ->destroy_clone when the ops provide it. */
+ if (expr->ops->destroy_clone) {
+ expr->ops->destroy_clone(ctx, expr);
+ module_put(expr->ops->type->owner); /* drop the reference on the expression type's owner module */
+ } else {
+ nf_tables_expr_destroy(ctx, expr); /* no destroy_clone hook: use the generic destroy helper */
+ }
+}
+
+static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_set_elem_expr *elem_expr)
+{ /* Destroy every expression that nft_setelem_expr_foreach() yields for @elem_expr. */
+ struct nft_expr *expr;
+ u32 size; /* passed to nft_setelem_expr_foreach(); presumably its iteration cursor - confirm against the macro */
+
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ __nft_set_elem_expr_destroy(ctx, expr);
}
void nft_set_elem_destroy(const struct nft_set *set, void *elem,
@@ -4844,40 +5579,311 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(ext);
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
+ nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
- if (expr->ops->destroy_clone) {
- expr->ops->destroy_clone(&ctx, expr);
- module_put(expr->ops->type->owner);
- } else {
- nf_tables_expr_destroy(&ctx, expr);
- }
- }
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
(*nft_set_ext_obj(ext))->use--;
kfree(elem);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
-/* Only called from commit path, nft_set_elem_deactivate() already deals with
- * the refcounting from the preparation phase.
+/* Only called from commit path, nft_setelem_data_deactivate() already deals
+ * with the refcounting from the preparation phase.
*/
static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) /* the element carries an expressions extension */
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); /* destroy each stored expression before freeing the element */
+
kfree(elem);
}
+int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_expr *expr_array[])
+{ /* Fill @expr_array with a clone of each of the set's set->num_exprs expressions; returns 0 or -ENOMEM. */
+ struct nft_expr *expr;
+ int err, i, k;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT); /* buffer sized by the source expression's ops->size */
+ if (!expr)
+ goto err_expr;
+
+ err = nft_expr_clone(expr, set->exprs[i]);
+ if (err < 0) {
+ kfree(expr); /* the failed clone target is freed directly, not via nft_expr_destroy() */
+ goto err_expr;
+ }
+ expr_array[i] = expr;
+ }
+
+ return 0;
+
+err_expr:
+ for (k = i - 1; k >= 0; k--) /* unwind the clones stored before the failing index */
+ nft_expr_destroy(ctx, expr_array[k]);
+
+ return -ENOMEM; /* returned for every failure; err from nft_expr_clone() is not propagated */
+}
+
+static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
+ const struct nft_set_ext_tmpl *tmpl,
+ const struct nft_set_ext *ext,
+ struct nft_expr *expr_array[],
+ u32 num_exprs)
+{ /* Clone @num_exprs expressions from @expr_array into the element's expressions extension, destroying each source on success. */
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ u32 len = sizeof(struct nft_set_elem_expr); /* running total: header plus every expression's ops->size */
+ struct nft_expr *expr;
+ int i, err;
+
+ if (num_exprs == 0)
+ return 0; /* nothing to install */
+
+ for (i = 0; i < num_exprs; i++)
+ len += expr_array[i]->ops->size;
+
+ if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0) /* total must fit the length recorded in the template */
+ return -EINVAL;
+
+ for (i = 0; i < num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size); /* destination slot at the current elem_expr->size */
+ err = nft_expr_clone(expr, expr_array[i]);
+ if (err < 0)
+ goto err_elem_expr_setup;
+
+ elem_expr->size += expr_array[i]->ops->size; /* account for the clone just stored */
+ nft_expr_destroy(ctx, expr_array[i]); /* the source expression is no longer needed */
+ expr_array[i] = NULL; /* cleared so the slot no longer points at the destroyed expression */
+ }
+
+ return 0;
+
+err_elem_expr_setup:
+ for (; i < num_exprs; i++) { /* destroy the failing source and every one after it */
+ nft_expr_destroy(ctx, expr_array[i]);
+ expr_array[i] = NULL;
+ }
+
+ return -ENOMEM; /* clones already stored in the element are left in place here; err from nft_expr_clone() is not propagated */
+}
+
+struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
+ const struct nft_set *set)
+{ /* Return the extension of the first usable entry on set->catchall_list, or NULL if there is none. */
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net); /* lookups are evaluated against the current generation */
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (nft_set_elem_active(ext, genmask) && /* active in the current generation */
+ !nft_set_elem_expired(ext)) /* and not expired */
+ return ext;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
+
+void *nft_set_catchall_gc(const struct nft_set *set)
+{ /* Unlink at most one expired entry from set->catchall_list and return its element, or NULL. */
+ struct nft_set_elem_catchall *catchall, *next;
+ struct nft_set_ext *ext;
+ void *elem = NULL;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!nft_set_elem_expired(ext) || /* skip entries that have not expired */
+ nft_set_elem_mark_busy(ext)) /* or for which nft_set_elem_mark_busy() returns non-zero */
+ continue;
+
+ elem = catchall->elem; /* the element itself is returned to the caller, not freed here */
+ list_del_rcu(&catchall->list);
+ kfree_rcu(catchall, rcu); /* only the list node is freed, via kfree_rcu() */
+ break; /* one entry per call */
+ }
+
+ return elem;
+}
+EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
+
+static int nft_setelem_catchall_insert(const struct net *net,
+ struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_set_ext **pext)
+{ /* Append elem->priv to set->catchall_list; returns 0, -EEXIST (with *pext set) or -ENOMEM. */
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_next(net); /* the duplicate check uses the next generation */
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (nft_set_elem_active(ext, genmask)) {
+ *pext = ext; /* report the clashing entry's extension to the caller */
+ return -EEXIST;
+ }
+ }
+
+ catchall = kmalloc(sizeof(*catchall), GFP_KERNEL);
+ if (!catchall)
+ return -ENOMEM;
+
+ catchall->elem = elem->priv;
+ list_add_tail_rcu(&catchall->list, &set->catchall_list);
+
+ return 0;
+}
+
+static int nft_setelem_insert(const struct net *net,
+ struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_set_ext **ext, unsigned int flags)
+{ /* Insert @elem through the catch-all list or the set backend, chosen by NFT_SET_ELEM_CATCHALL in @flags. */
+ int ret;
+
+ if (flags & NFT_SET_ELEM_CATCHALL)
+ ret = nft_setelem_catchall_insert(net, set, elem, ext);
+ else
+ ret = set->ops->insert(net, set, elem, ext); /* regular elements go to the backend's ->insert */
+
+ return ret;
+}
+
+static bool nft_setelem_is_catchall(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{ /* True when the element has a flags extension with NFT_SET_ELEM_CATCHALL set. */
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && /* the flags extension is only read when it exists */
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
+ return true;
+
+ return false;
+}
+
+static void nft_setelem_activate(struct net *net, struct nft_set *set,
+ struct nft_set_elem *elem)
+{ /* Activate @elem: catch-all elements are handled here, all others by the backend's ->activate. */
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ if (nft_setelem_is_catchall(set, elem)) {
+ nft_set_elem_change_active(net, set, ext); /* catch-all entries live outside the backend, so update the extension directly */
+ nft_set_elem_clear_busy(ext);
+ } else {
+ set->ops->activate(net, set, elem);
+ }
+}
+
+static int nft_setelem_catchall_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
+{ /* Deactivate the first eligible catch-all entry and point elem->priv at it; returns 0 or -ENOENT. */
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_is_active(net, ext) || /* skip entries nft_is_active() rejects */
+ nft_set_elem_mark_busy(ext)) /* or for which nft_set_elem_mark_busy() returns non-zero */
+ continue;
+
+ kfree(elem->priv); /* free the element the caller passed in */
+ elem->priv = catchall->elem; /* and substitute the element stored on the list */
+ nft_set_elem_change_active(net, set, ext);
+ return 0;
+ }
+
+ return -ENOENT; /* no eligible catch-all entry was found */
+}
+
+static int __nft_setelem_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
+{ /* Deactivate @elem through the set backend and swap elem->priv for the element it returns; 0 or -ENOENT. */
+ void *priv;
+
+ priv = set->ops->deactivate(net, set, elem);
+ if (!priv)
+ return -ENOENT; /* the backend returned NULL */
+
+ kfree(elem->priv); /* free the element the caller passed in */
+ elem->priv = priv; /* continue with the element the backend handed back */
+ set->ndeact++; /* count the deactivation on the set */
+
+ return 0;
+}
+
+static int nft_setelem_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem, u32 flags)
+{ /* Deactivate @elem via the catch-all list or the set backend, chosen by NFT_SET_ELEM_CATCHALL in @flags. */
+ int ret;
+
+ if (flags & NFT_SET_ELEM_CATCHALL)
+ ret = nft_setelem_catchall_deactivate(net, set, elem); /* this path does not touch set->ndeact */
+ else
+ ret = __nft_setelem_deactivate(net, set, elem); /* this path increments set->ndeact on success */
+
+ return ret;
+}
+
+static void nft_setelem_catchall_remove(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{ /* Unlink the catch-all list node whose ->elem is elem->priv; the element itself is not freed here. */
+ struct nft_set_elem_catchall *catchall, *next;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ if (catchall->elem == elem->priv) { /* match by element pointer */
+ list_del_rcu(&catchall->list);
+ kfree_rcu(catchall, rcu); /* only the list node is freed, via kfree_rcu() */
+ break; /* stop at the first match */
+ }
+ }
+}
+
+static void nft_setelem_remove(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{ /* Remove @elem from the catch-all list or from the set backend, depending on nft_setelem_is_catchall(). */
+ if (nft_setelem_is_catchall(set, elem))
+ nft_setelem_catchall_remove(net, set, elem);
+ else
+ set->ops->remove(net, set, elem);
+}
+
+static bool nft_setelem_valid_key_end(const struct nft_set *set,
+ struct nlattr **nla, u32 flags)
+{ /* Check that NFTA_SET_ELEM_KEY_END and the element flags are acceptable for this set's flags. */
+ if ((set->flags & (NFT_SET_CONCAT | NFT_SET_INTERVAL)) ==
+ (NFT_SET_CONCAT | NFT_SET_INTERVAL)) { /* set has both NFT_SET_CONCAT and NFT_SET_INTERVAL */
+ if (flags & NFT_SET_ELEM_INTERVAL_END) /* the interval-end flag is rejected on such sets */
+ return false;
+
+ if (nla[NFTA_SET_ELEM_KEY_END] && /* a key-end attribute is rejected on a catch-all element */
+ flags & NFT_SET_ELEM_CATCHALL)
+ return false;
+ } else {
+ if (nla[NFTA_SET_ELEM_KEY_END]) /* every other set type rejects a key-end attribute */
+ return false;
+ }
+
+ return true;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
u8 genmask = nft_genmask_next(ctx->net);
+ u32 flags = 0, size = 0, num_exprs = 0;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
@@ -4885,30 +5891,32 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_object *obj = NULL;
struct nft_userdata *udata;
struct nft_data_desc desc;
- struct nft_data data;
enum nft_registers dreg;
struct nft_trans *trans;
- u32 flags = 0;
u64 timeout;
u64 expiration;
+ int err, i;
u8 ulen;
- int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy, NULL);
if (err < 0)
return err;
- if (nla[NFTA_SET_ELEM_KEY] == NULL)
- return -EINVAL;
-
nft_set_ext_prepare(&tmpl);
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
+
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -4919,13 +5927,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return -EINVAL;
}
+ if (set->flags & NFT_SET_OBJECT) {
+ if (!nla[NFTA_SET_ELEM_OBJREF] &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END))
+ return -EINVAL;
+ } else {
+ if (nla[NFTA_SET_ELEM_OBJREF])
+ return -EINVAL;
+ }
+
+ if (!nft_setelem_valid_key_end(set, nla, flags))
+ return -EINVAL;
+
if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
(nla[NFTA_SET_ELEM_DATA] ||
nla[NFTA_SET_ELEM_OBJREF] ||
nla[NFTA_SET_ELEM_TIMEOUT] ||
nla[NFTA_SET_ELEM_EXPIRATION] ||
nla[NFTA_SET_ELEM_USERDATA] ||
- nla[NFTA_SET_ELEM_EXPR]))
+ nla[NFTA_SET_ELEM_EXPR] ||
+ nla[NFTA_SET_ELEM_KEY_END] ||
+ nla[NFTA_SET_ELEM_EXPRESSIONS]))
return -EINVAL;
timeout = 0;
@@ -4950,12 +5972,76 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
- err = nft_setelem_parse_key(ctx, set, &elem.key.val,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- return err;
+ if (nla[NFTA_SET_ELEM_EXPR]) {
+ struct nft_expr *expr;
+
+ if (set->num_exprs && set->num_exprs != 1)
+ return -EOPNOTSUPP;
+
+ expr = nft_set_elem_expr_alloc(ctx, set,
+ nla[NFTA_SET_ELEM_EXPR]);
+ if (IS_ERR(expr))
+ return PTR_ERR(expr);
+
+ expr_array[0] = expr;
+ num_exprs = 1;
+
+ if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) {
+ struct nft_expr *expr;
+ struct nlattr *tmp;
+ int left;
+
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX ||
+ (set->num_exprs && set->num_exprs == i)) {
+ err = -E2BIG;
+ goto err_set_elem_expr;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_elem_expr;
+ }
+ expr = nft_set_elem_expr_alloc(ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_elem_expr;
+ }
+ expr_array[i] = expr;
+ num_exprs++;
+
+ if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ } else if (set->num_exprs > 0) {
+ err = nft_set_elem_expr_clone(ctx, set, expr_array);
+ if (err < 0)
+ goto err_set_elem_expr_clone;
+
+ num_exprs = set->num_exprs;
+ }
+
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err_set_elem_expr;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto err_parse_key;
+ }
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -4963,20 +6049,34 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
goto err_parse_key;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (timeout > 0) {
- nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
- if (timeout != set->timeout)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
+ if (err < 0)
+ goto err_parse_key_end;
+
+ if (timeout != set->timeout) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ if (err < 0)
+ goto err_parse_key_end;
+ }
}
- if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
- if (!(set->flags & NFT_SET_OBJECT)) {
- err = -EINVAL;
+ if (num_exprs) {
+ for (i = 0; i < num_exprs; i++)
+ size += expr_array[i]->ops->size;
+
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+ if (err < 0)
goto err_parse_key_end;
- }
+ }
+
+ if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
@@ -4984,19 +6084,17 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = PTR_ERR(obj);
goto err_parse_key_end;
}
- nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
- err = nft_data_init(ctx, &data, sizeof(data), &desc,
- nla[NFTA_SET_ELEM_DATA]);
+ err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val,
+ nla[NFTA_SET_ELEM_DATA]);
if (err < 0)
goto err_parse_key_end;
- err = -EINVAL;
- if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen)
- goto err_parse_data;
-
dreg = nft_type_to_reg(set->dtype);
list_for_each_entry(binding, &set->bindings, list) {
struct nft_ctx bind_ctx = {
@@ -5010,19 +6108,21 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
continue;
err = nft_validate_register_store(&bind_ctx, dreg,
- &data,
+ &elem.data.val,
desc.type, desc.len);
if (err < 0)
goto err_parse_data;
if (desc.type == NFT_DATA_VERDICT &&
- (data.verdict.code == NFT_GOTO ||
- data.verdict.code == NFT_JUMP))
+ (elem.data.val.verdict.code == NFT_GOTO ||
+ elem.data.val.verdict.code == NFT_JUMP))
nft_validate_state_update(ctx->net,
NFT_VALIDATE_NEED);
}
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ if (err < 0)
+ goto err_parse_data;
}
/* The full maximum length of userdata can exceed the maximum
@@ -5032,22 +6132,31 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
ulen = 0;
if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
- if (ulen > 0)
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
- ulen);
+ if (ulen > 0) {
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
+ ulen);
+ if (err < 0)
+ goto err_parse_data;
+ }
}
- err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
- elem.key_end.val.data, data.data,
- timeout, expiration, GFP_KERNEL);
- if (elem.priv == NULL)
+ elem.key_end.val.data, elem.data.val.data,
+ timeout, expiration, GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
goto err_parse_data;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
+
if (ulen > 0) {
+ if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
+ err = -EINVAL;
+ goto err_elem_userdata;
+ }
udata = nft_set_ext_userdata(ext);
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
@@ -5056,22 +6165,26 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
*nft_set_ext_obj(ext) = obj;
obj->use++;
}
+ err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
+ if (err < 0)
+ goto err_elem_free;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
- if (trans == NULL)
- goto err_trans;
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err_elem_free;
+ }
ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
- err = set->ops->insert(ctx->net, set, &elem, &ext2);
+
+ err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
if (err) {
if (err == -EEXIST) {
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
- nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
- err = -EBUSY;
+ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
goto err_element_clash;
- }
if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
memcmp(nft_set_ext_data(ext),
@@ -5079,49 +6192,62 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
(nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
*nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
- err = -EBUSY;
+ goto err_element_clash;
else if (!(nlmsg_flags & NLM_F_EXCL))
err = 0;
+ } else if (err == -ENOTEMPTY) {
+ /* ENOTEMPTY reports overlapping between this element
+ * and an existing one.
+ */
+ err = -EEXIST;
}
goto err_element_clash;
}
- if (set->size &&
+ if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size &&
!atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
err = -ENFILE;
goto err_set_full;
}
nft_trans_elem(trans) = elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err_set_full:
- set->ops->remove(ctx->net, set, &elem);
+ nft_setelem_remove(ctx->net, set, &elem);
err_element_clash:
kfree(trans);
-err_trans:
+err_elem_free:
if (obj)
obj->use--;
- kfree(elem.priv);
+err_elem_userdata:
+ nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
- nft_data_release(&data, desc.type);
+ nft_data_release(&elem.data.val, desc.type);
err_parse_key_end:
nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
-
+err_set_elem_expr:
+ for (i = 0; i < num_exprs && expr_array[i]; i++)
+ nft_expr_destroy(ctx, expr_array[i]);
+err_set_elem_expr_clone:
return err;
}
-static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- u8 genmask = nft_genmask_next(net);
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err;
@@ -5129,12 +6255,14 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
@@ -5142,14 +6270,18 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
return -EBUSY;
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
- if (err < 0)
+ err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return err;
+ }
}
- if (net->nft.validate_state == NFT_VALIDATE_DO)
- return nft_table_validate(net, ctx.table);
+ if (nft_net->validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
return 0;
}
@@ -5167,19 +6299,32 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
*/
void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
{
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
if (type == NFT_DATA_VERDICT) {
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- data->verdict.chain->use++;
+ chain = data->verdict.chain; /* target chain of the jump/goto verdict */
+ chain->use++;
+
+ if (!nft_chain_is_bound(chain)) /* chains that are not bound only need the single reference above */
+ break;
+
+ chain->table->use++; /* bound chain: also take a reference on its table */
+ list_for_each_entry(rule, &chain->rules, list)
+ chain->use++; /* plus one chain reference per rule on the chain */
+
+ nft_chain_add(chain->table, chain);
break;
}
}
}
-static void nft_set_elem_activate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+static void nft_setelem_data_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
@@ -5189,9 +6334,9 @@ static void nft_set_elem_activate(const struct net *net,
(*nft_set_ext_obj(ext))->use++;
}
-static void nft_set_elem_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+static void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
@@ -5210,7 +6355,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_ext *ext;
struct nft_trans *trans;
u32 flags = 0;
- void *priv;
int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
@@ -5218,39 +6362,54 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- if (nla[NFTA_SET_ELEM_KEY] == NULL)
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
+
+ if (!nft_setelem_valid_key_end(set, nla, flags))
return -EINVAL;
nft_set_ext_prepare(&tmpl);
- err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
- if (err < 0)
- return err;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
- err = nft_setelem_parse_key(ctx, set, &elem.key.val,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- return err;
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ return err;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto fail_elem;
+ }
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
nla[NFTA_SET_ELEM_KEY_END]);
if (err < 0)
- return err;
+ goto fail_elem;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto fail_elem_key_end;
}
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, NULL, 0, 0,
- GFP_KERNEL);
- if (elem.priv == NULL)
- goto fail_elem;
+ GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
+ goto fail_elem_key_end;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
@@ -5260,33 +6419,31 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (trans == NULL)
goto fail_trans;
- priv = set->ops->deactivate(ctx->net, set, &elem);
- if (priv == NULL) {
- err = -ENOENT;
+ err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
+ if (err < 0)
goto fail_ops;
- }
- kfree(elem.priv);
- elem.priv = priv;
- nft_set_elem_deactivate(ctx->net, set, &elem);
+ nft_setelem_data_deactivate(ctx->net, set, &elem);
nft_trans_elem(trans) = elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
fail_ops:
kfree(trans);
fail_trans:
kfree(elem.priv);
+fail_elem_key_end:
+ nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
fail_elem:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
return err;
}
-static int nft_flush_set(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+static int nft_setelem_flush(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_set_elem *elem)
{
struct nft_trans *trans;
int err;
@@ -5302,10 +6459,10 @@ static int nft_flush_set(const struct nft_ctx *ctx,
}
set->ndeact++;
- nft_set_elem_deactivate(ctx->net, set, elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem);
nft_trans_elem_set(trans) = set;
nft_trans_elem(trans) = *elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err1:
@@ -5313,44 +6470,101 @@ err1:
return err;
}
-static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_elem *elem) /* Queue an NFT_MSG_DELSETELEM transaction for @elem; returns 0 or -ENOMEM. */
{
- u8 genmask = nft_genmask_next(net);
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
+ sizeof(struct nft_trans_elem), GFP_KERNEL);
+ if (!trans)
+ return -ENOMEM; /* the element has not been touched at this point */
+
+ nft_setelem_data_deactivate(ctx->net, set, elem);
+ nft_trans_elem_set(trans) = set; /* record the set and a copy of the element in the transaction */
+ nft_trans_elem(trans) = *elem;
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+}
+
+static int nft_set_catchall_flush(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{ /* Queue a delete transaction for every eligible entry on set->catchall_list; stops at the first error. */
+ u8 genmask = nft_genmask_next(ctx->net); /* eligibility is judged against the next generation */
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_elem elem;
+ struct nft_set_ext *ext;
+ int ret = 0; /* stays 0 when the list is empty or nothing is eligible */
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) || /* skip entries not active in the next generation */
+ nft_set_elem_mark_busy(ext)) /* or for which nft_set_elem_mark_busy() returns non-zero */
+ continue;
+
+ elem.priv = catchall->elem; /* only ->priv of the on-stack elem is filled in */
+ ret = __nft_set_catchall_flush(ctx, set, &elem);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
+static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
+{ /* Flush the set: walk the backend with nft_setelem_flush, then flush the catch-all list. */
+ struct nft_set_iter iter = {
+ .genmask = genmask,
+ .fn = nft_setelem_flush, /* callback set on the iterator passed to ->walk */
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ if (!iter.err) /* catch-all entries are flushed only if the walk reported no error */
+ iter.err = nft_set_catchall_flush(ctx, set);
+
+ return iter.err;
+}
+
+static int nf_tables_delsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{ /* Delete the listed elements from a set, or flush the whole set when no element list is supplied. */
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); /* point extack at the table attribute */
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
return -EBUSY;
- if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
- struct nft_set_iter iter = {
- .genmask = genmask,
- .fn = nft_flush_set,
- };
- set->ops->walk(&ctx, set, &iter);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); /* ctx is initialised only after the table and set checks pass */
- return iter.err;
- }
+ if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) /* no element list: flush the whole set */
+ return nft_set_flush(&ctx, set, genmask);
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr); /* point extack at the element attribute that failed */
break;
-
- set->ndeact++;
+ }
}
return err;
}
@@ -5365,7 +6579,6 @@ void nft_set_gc_batch_release(struct rcu_head *rcu)
nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
kfree(gcb);
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gfp_t gfp)
@@ -5378,7 +6591,6 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gcb->head.set = set;
return gcb;
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/*
* Stateful objects
@@ -5386,7 +6598,7 @@ EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/**
* nft_register_obj- register nf_tables stateful object type
- * @obj: object type
+ * @obj_type: object type
*
* Registers the object type for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
@@ -5405,7 +6617,7 @@ EXPORT_SYMBOL_GPL(nft_register_obj);
/**
* nft_unregister_obj - unregister nf_tables object type
- * @obj: object type
+ * @obj_type: object type
*
* Unregisters the object type for use with nf_tables.
*/
@@ -5427,7 +6639,7 @@ struct nft_object *nft_obj_lookup(const struct net *net,
struct rhlist_head *tmp, *list;
struct nft_object *obj;
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
k.name = search;
WARN_ON_ONCE(!rcu_read_lock_held() &&
@@ -5474,6 +6686,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
[NFTA_OBJ_HANDLE] = { .type = NLA_U64},
+ [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -5509,7 +6723,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
}
err = -ENOMEM;
- obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
+ obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
if (!obj)
goto err2;
@@ -5583,12 +6797,15 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
{
struct nft_object *newobj;
struct nft_trans *trans;
- int err;
+ int err = -ENOMEM;
+
+ if (!try_module_get(type->owner))
+ return -ENOENT;
trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
sizeof(struct nft_trans_obj));
if (!trans)
- return -ENOMEM;
+ goto err_trans;
newobj = nft_obj_init(ctx, type, attr);
if (IS_ERR(newobj)) {
@@ -5599,24 +6816,25 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
nft_trans_obj(trans) = obj;
nft_trans_obj_update(trans) = true;
nft_trans_obj_newobj(trans) = newobj;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err_free_trans:
kfree(trans);
+err_trans:
+ module_put(type->owner);
return err;
}
-static int nf_tables_newobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_object_type *type;
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct net *net = info->net;
struct nft_table *table;
struct nft_object *obj;
struct nft_ctx ctx;
@@ -5628,7 +6846,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
!nla[NFTA_OBJ_DATA])
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
@@ -5643,20 +6862,20 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
return err;
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
type = __nft_obj_type_get(objtype);
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
type = nft_obj_type_get(net, objtype);
if (IS_ERR(type))
@@ -5665,40 +6884,50 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err1;
+ goto err_init;
}
obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
- obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
+ obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
if (!obj->key.name) {
err = -ENOMEM;
- goto err2;
+ goto err_strdup;
+ }
+
+ if (nla[NFTA_OBJ_USERDATA]) {
+ obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL);
+ if (obj->udata == NULL)
+ goto err_userdata;
+
+ obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
}
err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
if (err < 0)
- goto err3;
+ goto err_trans;
err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
nft_objname_ht_params);
if (err < 0)
- goto err4;
+ goto err_obj_ht;
list_add_tail_rcu(&obj->list, &table->objects);
table->use++;
return 0;
-err4:
+err_obj_ht:
/* queued in transaction log */
INIT_LIST_HEAD(&obj->list);
return err;
-err3:
+err_trans:
+ kfree(obj->udata);
+err_userdata:
kfree(obj->key.name);
-err2:
+err_strdup:
if (obj->ops->destroy)
obj->ops->destroy(&ctx, obj);
kfree(obj);
-err1:
+err_init:
module_put(type->owner);
return err;
}
@@ -5708,19 +6937,14 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
int family, const struct nft_table *table,
struct nft_object *obj, bool reset)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
@@ -5730,6 +6954,10 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
NFTA_OBJ_PAD))
goto nla_put_failure;
+ if (obj->udata &&
+ nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -5751,6 +6979,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
struct nft_obj_filter *filter = cb->data;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
struct nft_object *obj;
bool reset = false;
@@ -5758,9 +6987,10 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
reset = true;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -5779,6 +7009,19 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
filter->type != NFT_OBJECT_UNSPEC &&
obj->ops->type->type != filter->type)
goto cont;
+ if (reset) {
+ char *buf = kasprintf(GFP_ATOMIC,
+ "%s:%u",
+ table->name,
+ nft_net->base_seq);
+
+ audit_log_nfcfg(buf,
+ family,
+ obj->handle,
+ AUDIT_NFT_OP_OBJ_RESET,
+ GFP_ATOMIC);
+ kfree(buf);
+ }
if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -5839,22 +7082,21 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_table *table;
+ struct net *net = info->net;
struct nft_object *obj;
struct sk_buff *skb2;
bool reset = false;
u32 objtype;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_obj_start,
.dump = nf_tables_dump_obj,
@@ -5863,14 +7105,14 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
.data = (void *)nla,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
@@ -5887,17 +7129,33 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
if (!skb2)
return -ENOMEM;
- if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
reset = true;
+ if (reset) {
+ const struct nftables_pernet *nft_net;
+ char *buf;
+
+ nft_net = nft_pernet(net);
+ buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq);
+
+ audit_log_nfcfg(buf,
+ family,
+ obj->handle,
+ AUDIT_NFT_OP_OBJ_RESET,
+ GFP_ATOMIC);
+ kfree(buf);
+ }
+
err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
family, table, obj, reset);
if (err < 0)
- goto err;
+ goto err_fill_obj_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_obj_info:
kfree_skb(skb2);
return err;
}
@@ -5909,17 +7167,17 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
module_put(obj->ops->type->owner);
kfree(obj->key.name);
+ kfree(obj->udata);
kfree(obj);
}
-static int nf_tables_delobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_object *obj;
@@ -5930,7 +7188,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
(!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
@@ -5954,17 +7213,29 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
return -EBUSY;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
return nft_delobj(&ctx, obj);
}
void nft_obj_notify(struct net *net, const struct nft_table *table,
struct nft_object *obj, u32 portid, u32 seq, int event,
- int family, int report, gfp_t gfp)
+ u16 flags, int family, int report, gfp_t gfp)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct sk_buff *skb;
int err;
+ char *buf = kasprintf(gfp, "%s:%u",
+ table->name, nft_net->base_seq);
+
+ audit_log_nfcfg(buf,
+ family,
+ obj->handle,
+ event == NFT_MSG_NEWOBJ ?
+ AUDIT_NFT_OP_OBJ_REGISTER :
+ AUDIT_NFT_OP_OBJ_UNREGISTER,
+ gfp);
+ kfree(buf);
if (!report &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -5974,14 +7245,15 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
if (skb == NULL)
goto err;
- err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
- table, obj, false);
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event,
+ flags & (NLM_F_CREATE | NLM_F_EXCL),
+ family, table, obj, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+ nft_notify_enqueue(skb, report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -5992,7 +7264,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx,
struct nft_object *obj, int event)
{
nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
- ctx->family, ctx->report, GFP_KERNEL);
+ ctx->flags, ctx->family, ctx->report, GFP_KERNEL);
}
/*
@@ -6047,7 +7319,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
flowtable->use--;
- /* fall through */
+ fallthrough;
default:
return;
}
@@ -6068,50 +7340,77 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table,
return ERR_PTR(-ENOENT);
}
+struct nft_flowtable_hook {
+ u32 num;
+ int priority;
+ struct list_head list;
+};
+
static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
[NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 },
[NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 },
[NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED },
};
-static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
- const struct nlattr *attr,
- struct nft_flowtable *flowtable)
+static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct nft_flowtable_hook *flowtable_hook,
+ struct nft_flowtable *flowtable, bool add)
{
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
struct nft_hook *hook;
int hooknum, priority;
int err;
+ INIT_LIST_HEAD(&flowtable_hook->list);
+
err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
nft_flowtable_hook_policy, NULL);
if (err < 0)
return err;
- if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
- !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] ||
- !tb[NFTA_FLOWTABLE_HOOK_DEVS])
- return -EINVAL;
+ if (add) {
+ if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
+ !tb[NFTA_FLOWTABLE_HOOK_PRIORITY])
+ return -EINVAL;
- hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
- if (hooknum != NF_NETDEV_INGRESS)
- return -EINVAL;
+ hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
+ if (hooknum != NF_NETDEV_INGRESS)
+ return -EOPNOTSUPP;
- priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
+ priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
- err = nf_tables_parse_netdev_hooks(ctx->net,
- tb[NFTA_FLOWTABLE_HOOK_DEVS],
- &flowtable->hook_list);
- if (err < 0)
- return err;
+ flowtable_hook->priority = priority;
+ flowtable_hook->num = hooknum;
+ } else {
+ if (tb[NFTA_FLOWTABLE_HOOK_NUM]) {
+ hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
+ if (hooknum != flowtable->hooknum)
+ return -EOPNOTSUPP;
+ }
- flowtable->hooknum = hooknum;
- flowtable->data.priority = priority;
+ if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
+ priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
+ if (priority != flowtable->data.priority)
+ return -EOPNOTSUPP;
+ }
- list_for_each_entry(hook, &flowtable->hook_list, list) {
+ flowtable_hook->priority = flowtable->data.priority;
+ flowtable_hook->num = flowtable->hooknum;
+ }
+
+ if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) {
+ err = nf_tables_parse_netdev_hooks(ctx->net,
+ tb[NFTA_FLOWTABLE_HOOK_DEVS],
+ &flowtable_hook->list);
+ if (err < 0)
+ return err;
+ }
+
+ list_for_each_entry(hook, &flowtable_hook->list, list) {
hook->ops.pf = NFPROTO_NETDEV;
- hook->ops.hooknum = hooknum;
- hook->ops.priority = priority;
+ hook->ops.hooknum = flowtable_hook->num;
+ hook->ops.priority = flowtable_hook->priority;
hook->ops.priv = &flowtable->data;
hook->ops.hook = flowtable->data.type->hook;
}
@@ -6159,29 +7458,45 @@ static void nft_unregister_flowtable_hook(struct net *net,
FLOW_BLOCK_UNBIND);
}
-static void nft_unregister_flowtable_net_hooks(struct net *net,
- struct nft_flowtable *flowtable)
+static void __nft_unregister_flowtable_net_hooks(struct net *net,
+ struct list_head *hook_list,
+ bool release_netdev)
{
- struct nft_hook *hook;
+ struct nft_hook *hook, *next;
- list_for_each_entry(hook, &flowtable->hook_list, list)
+ list_for_each_entry_safe(hook, next, hook_list, list) {
nf_unregister_net_hook(net, &hook->ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+ }
+}
+
+static void nft_unregister_flowtable_net_hooks(struct net *net,
+ struct list_head *hook_list)
+{
+ __nft_unregister_flowtable_net_hooks(net, hook_list, false);
}
static int nft_register_flowtable_net_hooks(struct net *net,
struct nft_table *table,
+ struct list_head *hook_list,
struct nft_flowtable *flowtable)
{
struct nft_hook *hook, *hook2, *next;
struct nft_flowtable *ft;
int err, i = 0;
- list_for_each_entry(hook, &flowtable->hook_list, list) {
+ list_for_each_entry(hook, hook_list, list) {
list_for_each_entry(ft, &table->flowtables, list) {
+ if (!nft_is_active_next(net, ft))
+ continue;
+
list_for_each_entry(hook2, &ft->hook_list, list) {
if (hook->ops.dev == hook2->ops.dev &&
hook->ops.pf == hook2->ops.pf) {
- err = -EBUSY;
+ err = -EEXIST;
goto err_unregister_net_hooks;
}
}
@@ -6207,7 +7522,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
return 0;
err_unregister_net_hooks:
- list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+ list_for_each_entry_safe(hook, next, hook_list, list) {
if (i-- <= 0)
break;
@@ -6219,18 +7534,101 @@ err_unregister_net_hooks:
return err;
}
-static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static void nft_flowtable_hooks_destroy(struct list_head *hook_list)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_hook *hook, *next;
+
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+}
+
+static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
+ struct nft_flowtable *flowtable)
+{
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_flowtable_hook flowtable_hook;
+ struct nft_hook *hook, *next;
+ struct nft_trans *trans;
+ bool unregister = false;
+ u32 flags;
+ int err;
+
+ err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
+ &flowtable_hook, flowtable, false);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
+ if (nft_hook_list_find(&flowtable->hook_list, hook)) {
+ list_del(&hook->list);
+ kfree(hook);
+ }
+ }
+
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
+ if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+ (flags & NFT_FLOWTABLE_HW_OFFLOAD)) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
+ } else {
+ flags = flowtable->data.flags;
+ }
+
+ err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
+ &flowtable_hook.list, flowtable);
+ if (err < 0)
+ goto err_flowtable_update_hook;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE,
+ sizeof(struct nft_trans_flowtable));
+ if (!trans) {
+ unregister = true;
+ err = -ENOMEM;
+ goto err_flowtable_update_hook;
+ }
+
+ nft_trans_flowtable_flags(trans) = flags;
+ nft_trans_flowtable(trans) = flowtable;
+ nft_trans_flowtable_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
+ list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans));
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_flowtable_update_hook:
+ list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
+ if (unregister)
+ nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+
+ return err;
+
+}
+
+static int nf_tables_newflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_flowtable_hook flowtable_hook;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nf_flowtable_type *type;
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
struct nft_hook *hook, *next;
+ struct net *net = info->net;
struct nft_table *table;
struct nft_ctx ctx;
int err;
@@ -6241,7 +7639,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
@@ -6256,17 +7654,19 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return err;
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return -EEXIST;
}
- return 0;
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+ return nft_flowtable_update(&ctx, info->nlh, flowtable);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
+ flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
if (!flowtable)
return -ENOMEM;
@@ -6274,7 +7674,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
flowtable->handle = nf_tables_alloc_handle(table);
INIT_LIST_HEAD(&flowtable->hook_list);
- flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+ flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
if (!flowtable->name) {
err = -ENOMEM;
goto err1;
@@ -6289,8 +7689,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (nla[NFTA_FLOWTABLE_FLAGS]) {
flowtable->data.flags =
ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
- if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD)
+ if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
goto err3;
+ }
}
write_pnet(&flowtable->data.net, net);
@@ -6299,17 +7701,20 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (err < 0)
goto err3;
- err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
- flowtable);
+ err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
+ &flowtable_hook, flowtable, true);
if (err < 0)
goto err4;
- err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable);
+ list_splice(&flowtable_hook.list, &flowtable->hook_list);
+ flowtable->data.priority = flowtable_hook.priority;
+ flowtable->hooknum = flowtable_hook.num;
+
+ err = nft_register_flowtable_net_hooks(ctx.net, table,
+ &flowtable->hook_list,
+ flowtable);
if (err < 0) {
- list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
- }
+ nft_flowtable_hooks_destroy(&flowtable->hook_list);
goto err4;
}
@@ -6338,16 +7743,73 @@ err1:
return err;
}
-static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nft_hook *this, *next;
+
+ list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
+ list_del(&this->list);
+ kfree(this);
+ }
+}
+
+static int nft_delflowtable_hook(struct nft_ctx *ctx,
+ struct nft_flowtable *flowtable)
+{
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_flowtable_hook flowtable_hook;
+ LIST_HEAD(flowtable_del_list);
+ struct nft_hook *this, *hook;
+ struct nft_trans *trans;
+ int err;
+
+ err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
+ &flowtable_hook, flowtable, false);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry(this, &flowtable_hook.list, list) {
+ hook = nft_hook_list_find(&flowtable->hook_list, this);
+ if (!hook) {
+ err = -ENOENT;
+ goto err_flowtable_del_hook;
+ }
+ list_move(&hook->list, &flowtable_del_list);
+ }
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
+ sizeof(struct nft_trans_flowtable));
+ if (!trans) {
+ err = -ENOMEM;
+ goto err_flowtable_del_hook;
+ }
+
+ nft_trans_flowtable(trans) = flowtable;
+ nft_trans_flowtable_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
+ list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans));
+ nft_flowtable_hook_release(&flowtable_hook);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_flowtable_del_hook:
+ list_splice(&flowtable_del_list, &flowtable->hook_list);
+ nft_flowtable_hook_release(&flowtable_hook);
+
+ return err;
+}
+
+static int nf_tables_delflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
@@ -6358,7 +7820,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
@@ -6376,36 +7838,36 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(flowtable);
}
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+ if (nla[NFTA_FLOWTABLE_HOOK])
+ return nft_delflowtable_hook(&ctx, flowtable);
+
if (flowtable->use > 0) {
NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
-
return nft_delflowtable(&ctx, flowtable);
}
static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq, int event,
u32 flags, int family,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct list_head *hook_list)
{
struct nlattr *nest, *nest_devs;
- struct nfgenmsg *nfmsg;
struct nft_hook *hook;
struct nlmsghdr *nlh;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
@@ -6425,7 +7887,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
if (!nest_devs)
goto nla_put_failure;
- list_for_each_entry_rcu(hook, &flowtable->hook_list, list) {
+ list_for_each_entry_rcu(hook, hook_list, list) {
if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
goto nla_put_failure;
}
@@ -6453,12 +7915,14 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ struct nftables_pernet *nft_net;
const struct nft_table *table;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -6478,7 +7942,9 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
NFT_MSG_NEWFLOWTABLE,
NLM_F_MULTI | NLM_F_APPEND,
- table->family, flowtable) < 0)
+ table->family,
+ flowtable,
+ &flowtable->hook_list) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -6529,21 +7995,19 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
- int family = nfmsg->nfgen_family;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
const struct nft_table *table;
+ struct net *net = info->net;
struct sk_buff *skb2;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_flowtable_start,
.dump = nf_tables_dump_flowtable,
@@ -6552,14 +8016,14 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
.data = (void *)nla,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_FLOWTABLE_NAME])
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, 0);
if (IS_ERR(table))
return PTR_ERR(table);
@@ -6573,26 +8037,30 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
+ info->nlh->nlmsg_seq,
NFT_MSG_NEWFLOWTABLE, 0, family,
- flowtable);
+ flowtable, &flowtable->hook_list);
if (err < 0)
- goto err;
+ goto err_fill_flowtable_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_flowtable_info:
kfree_skb(skb2);
return err;
}
static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
struct nft_flowtable *flowtable,
+ struct list_head *hook_list,
int event)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct sk_buff *skb;
+ u16 flags = 0;
int err;
- if (ctx->report &&
+ if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
return;
@@ -6600,16 +8068,18 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
- ctx->seq, event, 0,
- ctx->family, flowtable);
+ ctx->seq, event, flags,
+ ctx->family, flowtable, hook_list);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -6634,21 +8104,17 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
char buf[TASK_COMM_LEN];
int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
+ NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
- if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) ||
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) ||
nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
goto nla_put_failure;
@@ -6683,6 +8149,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nft_flowtable *flowtable;
+ struct nftables_pernet *nft_net;
struct nft_table *table;
struct net *net;
@@ -6690,13 +8157,14 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
return 0;
net = dev_net(dev);
- mutex_lock(&net->nft.commit_mutex);
- list_for_each_entry(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ list_for_each_entry(table, &nft_net->tables, list) {
list_for_each_entry(flowtable, &table->flowtables, list) {
nft_flowtable_event(event, dev, flowtable);
}
}
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return NOTIFY_DONE;
}
@@ -6712,7 +8180,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
struct sk_buff *skb2;
int err;
- if (nlmsg_report(nlh) &&
+ if (!nlmsg_report(nlh) &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
return;
@@ -6735,10 +8203,8 @@ err:
-ENOBUFS);
}
-static int nf_tables_getgen(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
struct sk_buff *skb2;
int err;
@@ -6747,128 +8213,152 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
if (skb2 == NULL)
return -ENOMEM;
- err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq);
+ err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq);
if (err < 0)
- goto err;
+ goto err_fill_gen_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+
+err_fill_gen_info:
kfree_skb(skb2);
return err;
}
static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_NEWTABLE] = {
- .call_batch = nf_tables_newtable,
+ .call = nf_tables_newtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_GETTABLE] = {
- .call_rcu = nf_tables_gettable,
+ .call = nf_tables_gettable,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_DELTABLE] = {
- .call_batch = nf_tables_deltable,
+ .call = nf_tables_deltable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_NEWCHAIN] = {
- .call_batch = nf_tables_newchain,
+ .call = nf_tables_newchain,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_GETCHAIN] = {
- .call_rcu = nf_tables_getchain,
+ .call = nf_tables_getchain,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_DELCHAIN] = {
- .call_batch = nf_tables_delchain,
+ .call = nf_tables_delchain,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_NEWRULE] = {
- .call_batch = nf_tables_newrule,
+ .call = nf_tables_newrule,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE] = {
- .call_rcu = nf_tables_getrule,
+ .call = nf_tables_getrule,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_DELRULE] = {
- .call_batch = nf_tables_delrule,
+ .call = nf_tables_delrule,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_NEWSET] = {
- .call_batch = nf_tables_newset,
+ .call = nf_tables_newset,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_GETSET] = {
- .call_rcu = nf_tables_getset,
+ .call = nf_tables_getset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_DELSET] = {
- .call_batch = nf_tables_delset,
+ .call = nf_tables_delset,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_NEWSETELEM] = {
- .call_batch = nf_tables_newsetelem,
+ .call = nf_tables_newsetelem,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETSETELEM] = {
- .call_rcu = nf_tables_getsetelem,
+ .call = nf_tables_getsetelem,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_DELSETELEM] = {
- .call_batch = nf_tables_delsetelem,
+ .call = nf_tables_delsetelem,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETGEN] = {
- .call_rcu = nf_tables_getgen,
+ .call = nf_tables_getgen,
+ .type = NFNL_CB_RCU,
},
[NFT_MSG_NEWOBJ] = {
- .call_batch = nf_tables_newobj,
+ .call = nf_tables_newobj,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ] = {
- .call_rcu = nf_tables_getobj,
+ .call = nf_tables_getobj,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_DELOBJ] = {
- .call_batch = nf_tables_delobj,
+ .call = nf_tables_delobj,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
- .call_rcu = nf_tables_getobj,
+ .call = nf_tables_getobj,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_NEWFLOWTABLE] = {
- .call_batch = nf_tables_newflowtable,
+ .call = nf_tables_newflowtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_GETFLOWTABLE] = {
- .call_rcu = nf_tables_getflowtable,
+ .call = nf_tables_getflowtable,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_DELFLOWTABLE] = {
- .call_batch = nf_tables_delflowtable,
+ .call = nf_tables_delflowtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
@@ -6876,16 +8366,17 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
static int nf_tables_validate(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_table *table;
- switch (net->nft.validate_state) {
+ switch (nft_net->validate_state) {
case NFT_VALIDATE_SKIP:
break;
case NFT_VALIDATE_NEED:
nft_validate_state_update(net, NFT_VALIDATE_DO);
- /* fall through */
+ fallthrough;
case NFT_VALIDATE_DO:
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_validate(net, table) < 0)
return -EAGAIN;
}
@@ -6957,7 +8448,7 @@ static void nft_obj_commit_update(struct nft_trans *trans)
if (obj->ops->update)
obj->ops->update(obj, newobj);
- kfree(newobj);
+ nft_obj_destroy(&trans->ctx, newobj);
}
static void nft_commit_release(struct nft_trans *trans)
@@ -6977,7 +8468,7 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&trans->ctx, nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
nf_tables_set_elem_destroy(&trans->ctx,
@@ -6988,7 +8479,10 @@ static void nft_commit_release(struct nft_trans *trans)
nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
break;
case NFT_MSG_DELFLOWTABLE:
- nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans))
+ nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ else
+ nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
@@ -7018,48 +8512,108 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
}
}
+void nf_tables_trans_destroy_flush_work(void)
+{
+ flush_work(&trans_destroy_work);
+}
+EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
+
+static bool nft_expr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ return false;
+}
+
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
+ const struct nft_expr *expr, *last;
+ struct nft_regs_track track = {};
+ unsigned int size, data_size;
+ void *data, *data_boundary;
+ struct nft_rule_dp *prule;
struct nft_rule *rule;
- unsigned int alloc = 0;
- int i;
/* already handled or inactive chain? */
- if (chain->rules_next || !nft_is_active_next(net, chain))
+ if (chain->blob_next || !nft_is_active_next(net, chain))
return 0;
- rule = list_entry(&chain->rules, struct nft_rule, list);
- i = 0;
-
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- alloc++;
+ data_size = 0;
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule)) {
+ data_size += sizeof(*prule) + rule->dlen;
+ if (data_size > INT_MAX)
+ return -ENOMEM;
+ }
}
+ data_size += offsetof(struct nft_rule_dp, data); /* last rule */
- chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
- if (!chain->rules_next)
+ chain->blob_next = nf_tables_chain_alloc_rules(data_size);
+ if (!chain->blob_next)
return -ENOMEM;
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- chain->rules_next[i++] = rule;
+ data = (void *)chain->blob_next->data;
+ data_boundary = data + data_size;
+ size = 0;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (!nft_is_active_next(net, rule))
+ continue;
+
+ prule = (struct nft_rule_dp *)data;
+ data += offsetof(struct nft_rule_dp, data);
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ size = 0;
+ track.last = nft_expr_last(rule);
+ nft_rule_for_each_expr(expr, last, rule) {
+ track.cur = expr;
+
+ if (nft_expr_reduce(&track, expr)) {
+ expr = track.cur;
+ continue;
+ }
+
+ if (WARN_ON_ONCE(data + expr->ops->size > data_boundary))
+ return -ENOMEM;
+
+ memcpy(data + size, expr, expr->ops->size);
+ size += expr->ops->size;
+ }
+ if (WARN_ON_ONCE(size >= 1 << 12))
+ return -ENOMEM;
+
+ prule->handle = rule->handle;
+ prule->dlen = size;
+ prule->is_last = 0;
+
+ data += size;
+ size = 0;
+ chain->blob_next->size += (unsigned long)(data - (void *)prule);
}
- chain->rules_next[i] = NULL;
+ prule = (struct nft_rule_dp *)data;
+ data += offsetof(struct nft_rule_dp, data);
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ nft_last_rule(chain->blob_next, prule);
+
return 0;
}
static void nf_tables_commit_chain_prepare_cancel(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
struct nft_chain *chain = trans->ctx.chain;
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- kvfree(chain->rules_next);
- chain->rules_next = NULL;
+ kvfree(chain->blob_next);
+ chain->blob_next = NULL;
}
}
}
@@ -7068,38 +8622,34 @@ static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
{
struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
- kvfree(o->start);
+ kvfree(o->blob);
}
-static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
{
- struct nft_rule **r = rules;
struct nft_rules_old *old;
- while (*r)
- r++;
-
- r++; /* rcu_head is after end marker */
- old = (void *) r;
- old->start = rules;
+ /* rcu_head is after end marker */
+ old = (void *)blob + sizeof(*blob) + blob->size;
+ old->blob = blob;
call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
}
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
- struct nft_rule **g0, **g1;
+ struct nft_rule_blob *g0, *g1;
bool next_genbit;
next_genbit = nft_gencursor_next(net);
- g0 = rcu_dereference_protected(chain->rules_gen_0,
+ g0 = rcu_dereference_protected(chain->blob_gen_0,
lockdep_commit_lock_is_held(net));
- g1 = rcu_dereference_protected(chain->rules_gen_1,
+ g1 = rcu_dereference_protected(chain->blob_gen_1,
lockdep_commit_lock_is_held(net));
/* No changes to this chain? */
- if (chain->rules_next == NULL) {
+ if (chain->blob_next == NULL) {
/* chain had no change in last or next generation */
if (g0 == g1)
return;
@@ -7108,10 +8658,10 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
* one uses same rules as current generation.
*/
if (next_genbit) {
- rcu_assign_pointer(chain->rules_gen_1, g0);
+ rcu_assign_pointer(chain->blob_gen_1, g0);
nf_tables_commit_chain_free_rules_old(g1);
} else {
- rcu_assign_pointer(chain->rules_gen_0, g1);
+ rcu_assign_pointer(chain->blob_gen_0, g1);
nf_tables_commit_chain_free_rules_old(g0);
}
@@ -7119,11 +8669,11 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
}
if (next_genbit)
- rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_1, chain->blob_next);
else
- rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_0, chain->blob_next);
- chain->rules_next = NULL;
+ chain->blob_next = NULL;
if (g0 == g1)
return;
@@ -7140,7 +8690,7 @@ static void nft_obj_del(struct nft_object *obj)
list_del_rcu(&obj->list);
}
-static void nft_chain_del(struct nft_chain *chain)
+void nft_chain_del(struct nft_chain *chain)
{
struct nft_table *table = chain->table;
@@ -7151,10 +8701,11 @@ static void nft_chain_del(struct nft_chain *chain)
static void nf_tables_module_autoload_cleanup(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_module_request *req, *next;
- WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
- list_for_each_entry_safe(req, next, &net->nft.module_list, list) {
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+ list_for_each_entry_safe(req, next, &nft_net->module_list, list) {
WARN_ON_ONCE(!req->done);
list_del(&req->list);
kfree(req);
@@ -7163,6 +8714,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net)
static void nf_tables_commit_release(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans;
/* all side effects have to be made visible.
@@ -7172,38 +8724,138 @@ static void nf_tables_commit_release(struct net *net)
* Memory reclaim happens asynchronously from work queue
* to prevent expensive synchronize_rcu() in commit phase.
*/
- if (list_empty(&net->nft.commit_list)) {
+ if (list_empty(&nft_net->commit_list)) {
nf_tables_module_autoload_cleanup(net);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return;
}
- trans = list_last_entry(&net->nft.commit_list,
+ trans = list_last_entry(&nft_net->commit_list,
struct nft_trans, list);
get_net(trans->ctx.net);
WARN_ON_ONCE(trans->put_net);
trans->put_net = true;
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
+ list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
nf_tables_module_autoload_cleanup(net);
- mutex_unlock(&net->nft.commit_mutex);
-
schedule_work(&trans_destroy_work);
+
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
+static void nft_commit_notify(struct net *net, u32 portid)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct sk_buff *batch_skb = NULL, *nskb, *skb;
+ unsigned char *data;
+ int len;
+
+ list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) {
+ if (!batch_skb) {
+new_batch:
+ batch_skb = skb;
+ len = NLMSG_GOODSIZE - skb->len;
+ list_del(&skb->list);
+ continue;
+ }
+ len -= skb->len;
+ if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) {
+ data = skb_put(batch_skb, skb->len);
+ memcpy(data, skb->data, skb->len);
+ list_del(&skb->list);
+ kfree_skb(skb);
+ continue;
+ }
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ goto new_batch;
+ }
+
+ if (batch_skb) {
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ }
+
+ WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+}
+
+static int nf_tables_commit_audit_alloc(struct list_head *adl,
+ struct nft_table *table)
+{
+ struct nft_audit_data *adp;
+
+ list_for_each_entry(adp, adl, list) {
+ if (adp->table == table)
+ return 0;
+ }
+ adp = kzalloc(sizeof(*adp), GFP_KERNEL);
+ if (!adp)
+ return -ENOMEM;
+ adp->table = table;
+ list_add(&adp->list, adl);
+ return 0;
+}
+
+static void nf_tables_commit_audit_free(struct list_head *adl)
+{
+ struct nft_audit_data *adp, *adn;
+
+ list_for_each_entry_safe(adp, adn, adl, list) {
+ list_del(&adp->list);
+ kfree(adp);
+ }
+}
+
+static void nf_tables_commit_audit_collect(struct list_head *adl,
+ struct nft_table *table, u32 op)
+{
+ struct nft_audit_data *adp;
+
+ list_for_each_entry(adp, adl, list) {
+ if (adp->table == table)
+ goto found;
+ }
+ WARN_ONCE(1, "table=%s not expected in commit list", table->name);
+ return;
+found:
+ adp->entries++;
+ if (!adp->op || adp->op > op)
+ adp->op = op;
+}
+
+#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22)
+
+static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
+{
+ struct nft_audit_data *adp, *adn;
+ char aubuf[AUNFTABLENAMELEN];
+
+ list_for_each_entry_safe(adp, adn, adl, list) {
+ snprintf(aubuf, AUNFTABLENAMELEN, "%s:%u", adp->table->name,
+ generation);
+ audit_log_nfcfg(aubuf, adp->table->family, adp->entries,
+ nft2audit_op[adp->op], GFP_KERNEL);
+ list_del(&adp->list);
+ kfree(adp);
+ }
}
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
+ unsigned int base_seq;
+ LIST_HEAD(adl);
int err;
- if (list_empty(&net->nft.commit_list)) {
- mutex_unlock(&net->nft.commit_mutex);
+ if (list_empty(&nft_net->commit_list)) {
+ mutex_unlock(&nft_net->commit_mutex);
return 0;
}
@@ -7216,9 +8868,15 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
return err;
/* 1. Allocate space for next generation rules_gen_X[] */
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
int ret;
+ ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table);
+ if (ret) {
+ nf_tables_commit_chain_prepare_cancel(net);
+ nf_tables_commit_audit_free(&adl);
+ return ret;
+ }
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
chain = trans->ctx.chain;
@@ -7226,13 +8884,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
ret = nf_tables_commit_chain_prepare(net, chain);
if (ret < 0) {
nf_tables_commit_chain_prepare_cancel(net);
+ nf_tables_commit_audit_free(&adl);
return ret;
}
}
}
/* step 2. Make rules_gen_X visible to packet path */
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
list_for_each_entry(chain, &table->chains, list)
nf_tables_commit_chain(net, chain);
}
@@ -7241,20 +8900,29 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- while (++net->nft.base_seq == 0);
+ base_seq = READ_ONCE(nft_net->base_seq);
+ while (++base_seq == 0)
+ ;
+
+ WRITE_ONCE(nft_net->base_seq, base_seq);
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+ nf_tables_commit_audit_collect(&adl, trans->ctx.table,
+ trans->msg_type);
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
+ nft_trans_destroy(trans);
+ break;
}
+ if (trans->ctx.table->flags & NFT_TABLE_F_DORMANT)
+ nf_tables_table_disable(net, trans->ctx.table);
+
+ trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
} else {
nft_clear(net, trans->ctx.table);
}
@@ -7289,6 +8957,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_rule_notify(&trans->ctx,
nft_trans_rule(trans),
NFT_MSG_NEWRULE);
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_DELRULE:
@@ -7299,6 +8970,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_rule_expr_deactivate(&trans->ctx,
nft_trans_rule(trans),
NFT_TRANS_COMMIT);
+
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
nft_clear(net, nft_trans_set(trans));
@@ -7321,10 +8995,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
case NFT_MSG_NEWSETELEM:
te = (struct nft_trans_elem *)trans->data;
- te->set->ops->activate(net, te->set, &te->elem);
+ nft_setelem_activate(net, te->set, &te->elem);
nf_tables_setelem_notify(&trans->ctx, te->set,
&te->elem,
- NFT_MSG_NEWSETELEM, 0);
+ NFT_MSG_NEWSETELEM);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSETELEM:
@@ -7332,10 +9006,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_setelem_notify(&trans->ctx, te->set,
&te->elem,
- NFT_MSG_DELSETELEM, 0);
- te->set->ops->remove(net, te->set, &te->elem);
- atomic_dec(&te->set->nelems);
- te->set->ndeact--;
+ NFT_MSG_DELSETELEM);
+ nft_setelem_remove(net, te->set, &te->elem);
+ if (!nft_setelem_is_catchall(te->set, &te->elem)) {
+ atomic_dec(&te->set->nelems);
+ te->set->ndeact--;
+ }
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
@@ -7357,24 +9033,48 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
NFT_MSG_DELOBJ);
break;
case NFT_MSG_NEWFLOWTABLE:
- nft_clear(net, nft_trans_flowtable(trans));
- nf_tables_flowtable_notify(&trans->ctx,
- nft_trans_flowtable(trans),
- NFT_MSG_NEWFLOWTABLE);
+ if (nft_trans_flowtable_update(trans)) {
+ nft_trans_flowtable(trans)->data.flags =
+ nft_trans_flowtable_flags(trans);
+ nf_tables_flowtable_notify(&trans->ctx,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable_hooks(trans),
+ NFT_MSG_NEWFLOWTABLE);
+ list_splice(&nft_trans_flowtable_hooks(trans),
+ &nft_trans_flowtable(trans)->hook_list);
+ } else {
+ nft_clear(net, nft_trans_flowtable(trans));
+ nf_tables_flowtable_notify(&trans->ctx,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable(trans)->hook_list,
+ NFT_MSG_NEWFLOWTABLE);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_DELFLOWTABLE:
- list_del_rcu(&nft_trans_flowtable(trans)->list);
- nf_tables_flowtable_notify(&trans->ctx,
- nft_trans_flowtable(trans),
- NFT_MSG_DELFLOWTABLE);
- nft_unregister_flowtable_net_hooks(net,
- nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans)) {
+ nf_tables_flowtable_notify(&trans->ctx,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable_hooks(trans),
+ NFT_MSG_DELFLOWTABLE);
+ nft_unregister_flowtable_net_hooks(net,
+ &nft_trans_flowtable_hooks(trans));
+ } else {
+ list_del_rcu(&nft_trans_flowtable(trans)->list);
+ nf_tables_flowtable_notify(&trans->ctx,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable(trans)->hook_list,
+ NFT_MSG_DELFLOWTABLE);
+ nft_unregister_flowtable_net_hooks(net,
+ &nft_trans_flowtable(trans)->hook_list);
+ }
break;
}
}
+ nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+ nf_tables_commit_audit_log(&adl, nft_net->base_seq);
nf_tables_commit_release(net);
return 0;
@@ -7382,17 +9082,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
static void nf_tables_module_autoload(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_module_request *req, *next;
LIST_HEAD(module_list);
- list_splice_init(&net->nft.module_list, &module_list);
- mutex_unlock(&net->nft.commit_mutex);
+ list_splice_init(&nft_net->module_list, &module_list);
+ mutex_unlock(&nft_net->commit_mutex);
list_for_each_entry_safe(req, next, &module_list, list) {
request_module("%s", req->module);
req->done = true;
}
- mutex_lock(&net->nft.commit_mutex);
- list_splice(&module_list, &net->nft.module_list);
+ mutex_lock(&nft_net->commit_mutex);
+ list_splice(&module_list, &nft_net->module_list);
}
static void nf_tables_abort_release(struct nft_trans *trans)
@@ -7408,7 +9109,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&trans->ctx, nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
nft_set_elem_destroy(nft_trans_elem_set(trans),
@@ -7418,27 +9119,41 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
break;
case NFT_MSG_NEWFLOWTABLE:
- nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans))
+ nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ else
+ nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
kfree(trans);
}
-static int __nf_tables_abort(struct net *net, bool autoload)
+static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
- list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ if (action == NFNL_ABORT_VALIDATE &&
+ nf_tables_validate(net) < 0)
+ return -EAGAIN;
+
+ list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
list) {
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
+ if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_DORMANT) {
+ nf_tables_table_disable(net, trans->ctx.table);
trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
+ trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT;
}
+ trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
nft_trans_destroy(trans);
} else {
list_del_rcu(&trans->ctx.table->list);
@@ -7454,6 +9169,10 @@ static int __nf_tables_abort(struct net *net, bool autoload)
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
} else {
+ if (nft_chain_is_bound(trans->ctx.chain)) {
+ nft_trans_destroy(trans);
+ break;
+ }
trans->ctx.table->use--;
nft_chain_del(trans->ctx.chain);
nf_tables_unregister_hook(trans->ctx.net,
@@ -7472,11 +9191,16 @@ static int __nf_tables_abort(struct net *net, bool autoload)
nft_rule_expr_deactivate(&trans->ctx,
nft_trans_rule(trans),
NFT_TRANS_ABORT);
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_DELRULE:
trans->ctx.chain->use++;
nft_clear(trans->ctx.net, nft_trans_rule(trans));
nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSET:
@@ -7498,21 +9222,23 @@ static int __nf_tables_abort(struct net *net, bool autoload)
break;
}
te = (struct nft_trans_elem *)trans->data;
- te->set->ops->remove(net, te->set, &te->elem);
- atomic_dec(&te->set->nelems);
+ nft_setelem_remove(net, te->set, &te->elem);
+ if (!nft_setelem_is_catchall(te->set, &te->elem))
+ atomic_dec(&te->set->nelems);
break;
case NFT_MSG_DELSETELEM:
te = (struct nft_trans_elem *)trans->data;
- nft_set_elem_activate(net, te->set, &te->elem);
- te->set->ops->activate(net, te->set, &te->elem);
- te->set->ndeact--;
+ nft_setelem_data_activate(net, te->set, &te->elem);
+ nft_setelem_activate(net, te->set, &te->elem);
+ if (!nft_setelem_is_catchall(te->set, &te->elem))
+ te->set->ndeact--;
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
- kfree(nft_trans_obj_newobj(trans));
+ nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans));
nft_trans_destroy(trans);
} else {
trans->ctx.table->use--;
@@ -7525,14 +9251,24 @@ static int __nf_tables_abort(struct net *net, bool autoload)
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWFLOWTABLE:
- trans->ctx.table->use--;
- list_del_rcu(&nft_trans_flowtable(trans)->list);
- nft_unregister_flowtable_net_hooks(net,
- nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans)) {
+ nft_unregister_flowtable_net_hooks(net,
+ &nft_trans_flowtable_hooks(trans));
+ } else {
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_flowtable(trans)->list);
+ nft_unregister_flowtable_net_hooks(net,
+ &nft_trans_flowtable(trans)->hook_list);
+ }
break;
case NFT_MSG_DELFLOWTABLE:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans)) {
+ list_splice(&nft_trans_flowtable_hooks(trans),
+ &nft_trans_flowtable(trans)->hook_list);
+ } else {
+ trans->ctx.table->use++;
+ nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+ }
nft_trans_destroy(trans);
break;
}
@@ -7541,12 +9277,12 @@ static int __nf_tables_abort(struct net *net, bool autoload)
synchronize_rcu();
list_for_each_entry_safe_reverse(trans, next,
- &net->nft.commit_list, list) {
+ &nft_net->commit_list, list) {
list_del(&trans->list);
nf_tables_abort_release(trans);
}
- if (autoload)
+ if (action == NFNL_ABORT_AUTOLOAD)
nf_tables_module_autoload(net);
else
nf_tables_module_autoload_cleanup(net);
@@ -7559,24 +9295,27 @@ static void nf_tables_cleanup(struct net *net)
nft_validate_state_update(net, NFT_VALIDATE_SKIP);
}
-static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload)
+static int nf_tables_abort(struct net *net, struct sk_buff *skb,
+ enum nfnl_abort_action action)
{
- int ret = __nf_tables_abort(net, autoload);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ int ret = __nf_tables_abort(net, action);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return ret;
}
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
bool genid_ok;
- mutex_lock(&net->nft.commit_mutex);
+ mutex_lock(&nft_net->commit_mutex);
- genid_ok = genid == 0 || net->nft.base_seq == genid;
+ genid_ok = genid == 0 || nft_net->base_seq == genid;
if (!genid_ok)
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
/* else, commit mutex has to be released by commit or abort function */
return genid_ok;
@@ -7638,26 +9377,59 @@ EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
static int nf_tables_check_loops(const struct nft_ctx *ctx,
const struct nft_chain *chain);
+static int nft_check_loops(const struct nft_ctx *ctx,
+ const struct nft_set_ext *ext)
+{
+ const struct nft_data *data;
+ int ret;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ ret = nf_tables_check_loops(ctx, data->verdict.chain);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- const struct nft_data *data;
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
*nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
return 0;
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- return nf_tables_check_loops(ctx, data->verdict.chain);
- default:
- return 0;
+ return nft_check_loops(ctx, ext);
+}
+
+static int nft_set_catchall_loops(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ ret = nft_check_loops(ctx, ext);
+ if (ret < 0)
+ return ret;
}
+
+ return ret;
}
static int nf_tables_check_loops(const struct nft_ctx *ctx,
@@ -7693,6 +9465,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
data->verdict.chain);
if (err < 0)
return err;
+ break;
default:
break;
}
@@ -7718,6 +9491,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
iter.fn = nf_tables_loop_check_setelem;
set->ops->walk(ctx, set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_loops(ctx, set);
+
if (iter.err < 0)
return iter.err;
}
@@ -7751,28 +9527,24 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
}
EXPORT_SYMBOL_GPL(nft_parse_u32_check);
-/**
- * nft_parse_register - parse a register value from a netlink attribute
- *
- * @attr: netlink attribute
- *
- * Parse and translate a register value from a netlink attribute.
- * Registers used to be 128 bit wide, these register numbers will be
- * mapped to the corresponding 32 bit register numbers.
- */
-unsigned int nft_parse_register(const struct nlattr *attr)
+static int nft_parse_register(const struct nlattr *attr, u32 *preg)
{
unsigned int reg;
reg = ntohl(nla_get_be32(attr));
switch (reg) {
case NFT_REG_VERDICT...NFT_REG_4:
- return reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ break;
+ case NFT_REG32_00...NFT_REG32_15:
+ *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ break;
default:
- return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ return -ERANGE;
}
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_parse_register);
/**
* nft_dump_register - dump a register value to a netlink attribute
@@ -7796,16 +9568,7 @@ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg)
}
EXPORT_SYMBOL_GPL(nft_dump_register);
-/**
- * nft_validate_register_load - validate a load from a register
- *
- * @reg: the register number
- * @len: the length of the data
- *
- * Validate that the input register is one of the general purpose
- * registers and that the length of the load is within the bounds.
- */
-int nft_validate_register_load(enum nft_registers reg, unsigned int len)
+static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
{
if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
return -EINVAL;
@@ -7816,26 +9579,30 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len)
return 0;
}
-EXPORT_SYMBOL_GPL(nft_validate_register_load);
-/**
- * nft_validate_register_store - validate an expressions' register store
- *
- * @ctx: context of the expression performing the load
- * @reg: the destination register number
- * @data: the data to load
- * @type: the data type
- * @len: the length of the data
- *
- * Validate that a data load uses the appropriate data type for
- * the destination register and the length is within the bounds.
- * A value of NULL for the data means that its runtime gathered
- * data.
- */
-int nft_validate_register_store(const struct nft_ctx *ctx,
- enum nft_registers reg,
- const struct nft_data *data,
- enum nft_data_types type, unsigned int len)
+int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
+{
+ u32 reg;
+ int err;
+
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_register_load(reg, len);
+ if (err < 0)
+ return err;
+
+ *sreg = reg;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_register_load);
+
+static int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type,
+ unsigned int len)
{
int err;
@@ -7867,12 +9634,33 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
return 0;
}
}
-EXPORT_SYMBOL_GPL(nft_validate_register_store);
+
+int nft_parse_register_store(const struct nft_ctx *ctx,
+ const struct nlattr *attr, u8 *dreg,
+ const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ int err;
+ u32 reg;
+
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_register_store(ctx, reg, data, type, len);
+ if (err < 0)
+ return err;
+
+ *dreg = reg;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_register_store);
static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
[NFTA_VERDICT_CODE] = { .type = NLA_U32 },
[NFTA_VERDICT_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 },
};
static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
@@ -7902,21 +9690,35 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
default:
return -EINVAL;
}
- /* fall through */
+ fallthrough;
case NFT_CONTINUE:
case NFT_BREAK:
case NFT_RETURN:
break;
case NFT_JUMP:
case NFT_GOTO:
- if (!tb[NFTA_VERDICT_CHAIN])
+ if (tb[NFTA_VERDICT_CHAIN]) {
+ chain = nft_chain_lookup(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN],
+ genmask);
+ } else if (tb[NFTA_VERDICT_CHAIN_ID]) {
+ chain = nft_chain_lookup_byid(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN_ID]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ } else {
return -EINVAL;
- chain = nft_chain_lookup(ctx->net, ctx->table,
- tb[NFTA_VERDICT_CHAIN], genmask);
+ }
+
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
return -EOPNOTSUPP;
+ if (nft_chain_is_bound(chain))
+ return -EINVAL;
+ if (desc->flags & NFT_DATA_DESC_SETELEM &&
+ chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
chain->use++;
data->verdict.chain = chain;
@@ -7924,16 +9726,29 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
}
desc->len = sizeof(data->verdict);
- desc->type = NFT_DATA_VERDICT;
+
return 0;
}
static void nft_verdict_uninit(const struct nft_data *data)
{
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- data->verdict.chain->use--;
+ chain = data->verdict.chain;
+ chain->use--;
+
+ if (!nft_chain_is_bound(chain))
+ break;
+
+ chain->table->use--;
+ list_for_each_entry(rule, &chain->rules, list)
+ chain->use--;
+
+ nft_chain_del(chain);
break;
}
}
@@ -7964,20 +9779,25 @@ nla_put_failure:
}
static int nft_value_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
- struct nft_data_desc *desc, const struct nlattr *nla)
+ struct nft_data *data, struct nft_data_desc *desc,
+ const struct nlattr *nla)
{
unsigned int len;
len = nla_len(nla);
if (len == 0)
return -EINVAL;
- if (len > size)
+ if (len > desc->size)
return -EOVERFLOW;
+ if (desc->len) {
+ if (len != desc->len)
+ return -EINVAL;
+ } else {
+ desc->len = len;
+ }
nla_memcpy(data->data, nla, len);
- desc->type = NFT_DATA_VALUE;
- desc->len = len;
+
return 0;
}
@@ -7997,7 +9817,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
*
* @ctx: context of the expression using the data
* @data: destination struct nft_data
- * @size: maximum data length
* @desc: data description
* @nla: netlink attribute containing data
*
@@ -8007,24 +9826,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
* The caller can indicate that it only wants to accept data of type
* NFT_DATA_VALUE by passing NULL for the ctx argument.
*/
-int nft_data_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla)
{
struct nlattr *tb[NFTA_DATA_MAX + 1];
int err;
+ if (WARN_ON_ONCE(!desc->size))
+ return -EINVAL;
+
err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
nft_data_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_DATA_VALUE])
- return nft_value_init(ctx, data, size, desc,
- tb[NFTA_DATA_VALUE]);
- if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
- return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
- return -EINVAL;
+ if (tb[NFTA_DATA_VALUE]) {
+ if (desc->type != NFT_DATA_VALUE)
+ return -EINVAL;
+
+ err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) {
+ if (desc->type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ } else {
+ err = -EINVAL;
+ }
+
+ return err;
}
EXPORT_SYMBOL_GPL(nft_data_init);
@@ -8098,10 +9928,34 @@ int __nft_release_basechain(struct nft_ctx *ctx)
}
EXPORT_SYMBOL_GPL(__nft_release_basechain);
-static void __nft_release_tables(struct net *net)
+static void __nft_release_hook(struct net *net, struct nft_table *table)
+{
+ struct nft_flowtable *flowtable;
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list)
+ __nf_tables_unregister_hook(net, table, chain, true);
+ list_for_each_entry(flowtable, &table->flowtables, list)
+ __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list,
+ true);
+}
+
+static void __nft_release_hooks(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_table *table;
+
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (nft_table_has_owner(table))
+ continue;
+
+ __nft_release_hook(net, table);
+ }
+}
+
+static void __nft_release_table(struct net *net, struct nft_table *table)
{
struct nft_flowtable *flowtable, *nf;
- struct nft_table *table, *nt;
struct nft_chain *chain, *nc;
struct nft_object *obj, *ne;
struct nft_rule *rule, *nr;
@@ -8111,123 +9965,203 @@ static void __nft_release_tables(struct net *net)
.family = NFPROTO_NETDEV,
};
- list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
- ctx.family = table->family;
-
- list_for_each_entry(chain, &table->chains, list)
- nf_tables_unregister_hook(net, table, chain);
- /* No packets are walking on these chains anymore. */
- ctx.table = table;
- list_for_each_entry(chain, &table->chains, list) {
- ctx.chain = chain;
- list_for_each_entry_safe(rule, nr, &chain->rules, list) {
- list_del(&rule->list);
- chain->use--;
- nf_tables_rule_release(&ctx, rule);
- }
- }
- list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
- list_del(&flowtable->list);
- table->use--;
- nf_tables_flowtable_destroy(flowtable);
- }
- list_for_each_entry_safe(set, ns, &table->sets, list) {
- list_del(&set->list);
- table->use--;
- nft_set_destroy(set);
- }
- list_for_each_entry_safe(obj, ne, &table->objects, list) {
- nft_obj_del(obj);
- table->use--;
- nft_obj_destroy(&ctx, obj);
- }
- list_for_each_entry_safe(chain, nc, &table->chains, list) {
- ctx.chain = chain;
- nft_chain_del(chain);
- table->use--;
- nf_tables_chain_destroy(&ctx);
+ ctx.family = table->family;
+ ctx.table = table;
+ list_for_each_entry(chain, &table->chains, list) {
+ ctx.chain = chain;
+ list_for_each_entry_safe(rule, nr, &chain->rules, list) {
+ list_del(&rule->list);
+ chain->use--;
+ nf_tables_rule_release(&ctx, rule);
}
+ }
+ list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
+ list_del(&flowtable->list);
+ table->use--;
+ nf_tables_flowtable_destroy(flowtable);
+ }
+ list_for_each_entry_safe(set, ns, &table->sets, list) {
+ list_del(&set->list);
+ table->use--;
+ nft_set_destroy(&ctx, set);
+ }
+ list_for_each_entry_safe(obj, ne, &table->objects, list) {
+ nft_obj_del(obj);
+ table->use--;
+ nft_obj_destroy(&ctx, obj);
+ }
+ list_for_each_entry_safe(chain, nc, &table->chains, list) {
+ ctx.chain = chain;
+ nft_chain_del(chain);
+ table->use--;
+ nf_tables_chain_destroy(&ctx);
+ }
+ nf_tables_table_destroy(&ctx);
+}
+
+static void __nft_release_tables(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_table *table, *nt;
+
+ list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
+ if (nft_table_has_owner(table))
+ continue;
+
list_del(&table->list);
- nf_tables_table_destroy(&ctx);
+
+ __nft_release_table(net, table);
}
}
+static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct nft_table *table, *to_delete[8];
+ struct nftables_pernet *nft_net;
+ struct netlink_notify *n = ptr;
+ struct net *net = n->net;
+ unsigned int deleted;
+ bool restart = false;
+
+ if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(net);
+ deleted = 0;
+ mutex_lock(&nft_net->commit_mutex);
+ if (!list_empty(&nf_tables_destroy_list))
+ rcu_barrier();
+again:
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (nft_table_has_owner(table) &&
+ n->portid == table->nlpid) {
+ __nft_release_hook(net, table);
+ list_del_rcu(&table->list);
+ to_delete[deleted++] = table;
+ if (deleted >= ARRAY_SIZE(to_delete))
+ break;
+ }
+ }
+ if (deleted) {
+ restart = deleted >= ARRAY_SIZE(to_delete);
+ synchronize_rcu();
+ while (deleted)
+ __nft_release_table(net, to_delete[--deleted]);
+
+ if (restart)
+ goto again;
+ }
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nft_nl_notifier = {
+ .notifier_call = nft_rcv_nl_event,
+};
+
static int __net_init nf_tables_init_net(struct net *net)
{
- INIT_LIST_HEAD(&net->nft.tables);
- INIT_LIST_HEAD(&net->nft.commit_list);
- INIT_LIST_HEAD(&net->nft.module_list);
- mutex_init(&net->nft.commit_mutex);
- net->nft.base_seq = 1;
- net->nft.validate_state = NFT_VALIDATE_SKIP;
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ INIT_LIST_HEAD(&nft_net->tables);
+ INIT_LIST_HEAD(&nft_net->commit_list);
+ INIT_LIST_HEAD(&nft_net->module_list);
+ INIT_LIST_HEAD(&nft_net->notify_list);
+ mutex_init(&nft_net->commit_mutex);
+ nft_net->base_seq = 1;
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
return 0;
}
+static void __net_exit nf_tables_pre_exit_net(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ mutex_lock(&nft_net->commit_mutex);
+ __nft_release_hooks(net);
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
static void __net_exit nf_tables_exit_net(struct net *net)
{
- mutex_lock(&net->nft.commit_mutex);
- if (!list_empty(&net->nft.commit_list))
- __nf_tables_abort(net, false);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ mutex_lock(&nft_net->commit_mutex);
+ if (!list_empty(&nft_net->commit_list) ||
+ !list_empty(&nft_net->module_list))
+ __nf_tables_abort(net, NFNL_ABORT_NONE);
__nft_release_tables(net);
- mutex_unlock(&net->nft.commit_mutex);
- WARN_ON_ONCE(!list_empty(&net->nft.tables));
- WARN_ON_ONCE(!list_empty(&net->nft.module_list));
+ mutex_unlock(&nft_net->commit_mutex);
+ WARN_ON_ONCE(!list_empty(&nft_net->tables));
+ WARN_ON_ONCE(!list_empty(&nft_net->module_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
}
static struct pernet_operations nf_tables_net_ops = {
- .init = nf_tables_init_net,
- .exit = nf_tables_exit_net,
+ .init = nf_tables_init_net,
+ .pre_exit = nf_tables_pre_exit_net,
+ .exit = nf_tables_exit_net,
+ .id = &nf_tables_net_id,
+ .size = sizeof(struct nftables_pernet),
};
static int __init nf_tables_module_init(void)
{
int err;
- spin_lock_init(&nf_tables_destroy_list_lock);
err = register_pernet_subsys(&nf_tables_net_ops);
if (err < 0)
return err;
err = nft_chain_filter_init();
if (err < 0)
- goto err1;
+ goto err_chain_filter;
err = nf_tables_core_module_init();
if (err < 0)
- goto err2;
+ goto err_core_module;
err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
if (err < 0)
- goto err3;
+ goto err_netdev_notifier;
err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params);
if (err < 0)
- goto err4;
+ goto err_rht_objname;
err = nft_offload_init();
if (err < 0)
- goto err5;
+ goto err_offload;
+
+ err = netlink_register_notifier(&nft_nl_notifier);
+ if (err < 0)
+ goto err_netlink_notifier;
/* must be last */
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
- goto err6;
+ goto err_nfnl_subsys;
nft_chain_route_init();
return err;
-err6:
+
+err_nfnl_subsys:
+ netlink_unregister_notifier(&nft_nl_notifier);
+err_netlink_notifier:
nft_offload_exit();
-err5:
+err_offload:
rhltable_destroy(&nft_objname_ht);
-err4:
+err_rht_objname:
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
-err3:
+err_netdev_notifier:
nf_tables_core_module_exit();
-err2:
+err_core_module:
nft_chain_filter_fini();
-err1:
+err_chain_filter:
unregister_pernet_subsys(&nf_tables_net_ops);
return err;
}
@@ -8235,6 +10169,7 @@ err1:
static void __exit nf_tables_module_exit(void)
{
nfnetlink_subsys_unregister(&nf_tables_subsys);
+ netlink_unregister_notifier(&nft_nl_notifier);
nft_offload_exit();
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 96c74c4c7176..cee3e4e905ec 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -25,9 +25,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
enum nft_trace_types type)
{
- const struct nft_pktinfo *pkt = info->pkt;
-
- if (!info->trace || !pkt->skb->nf_trace)
+ if (!info->trace || !info->nf_trace)
return;
info->chain = chain;
@@ -36,28 +34,99 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
nft_trace_notify(info);
}
-static inline void nft_trace_packet(struct nft_traceinfo *info,
+static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
const struct nft_chain *chain,
- const struct nft_rule *rule,
+ const struct nft_rule_dp *rule,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
+ info->nf_trace = pkt->skb->nf_trace;
info->rule = rule;
__nft_trace_packet(info, chain, type);
}
}
+static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info)
+{
+ if (static_branch_unlikely(&nft_trace_enabled)) {
+ if (info->trace)
+ info->nf_trace = pkt->skb->nf_trace;
+ }
+}
+
+static void nft_bitwise_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ u32 *src = &regs->data[priv->sreg];
+ u32 *dst = &regs->data[priv->dreg];
+
+ *dst = (*src & priv->mask) ^ priv->xor;
+}
+
static void nft_cmp_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- u32 mask = nft_cmp_fast_mask(priv->len);
- if ((regs->data[priv->sreg] & mask) == priv->data)
+ if (((regs->data[priv->sreg] & priv->mask) == priv->data) ^ priv->inv)
return;
regs->verdict.code = NFT_BREAK;
}
+static void nft_cmp16_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ const u64 *reg_data = (const u64 *)&regs->data[priv->sreg];
+ const u64 *mask = (const u64 *)&priv->mask;
+ const u64 *data = (const u64 *)&priv->data;
+
+ if (((reg_data[0] & mask[0]) == data[0] &&
+ ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv)
+ return;
+ regs->verdict.code = NFT_BREAK;
+}
+
+static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
+ const struct nft_chain *chain,
+ const struct nft_regs *regs)
+{
+ enum nft_trace_types type;
+
+ switch (regs->verdict.code) {
+ case NFT_CONTINUE:
+ case NFT_RETURN:
+ type = NFT_TRACETYPE_RETURN;
+ break;
+ case NF_STOLEN:
+ type = NFT_TRACETYPE_RULE;
+ /* can't access skb->nf_trace; use copy */
+ break;
+ default:
+ type = NFT_TRACETYPE_RULE;
+
+ if (info->trace)
+ info->nf_trace = info->pkt->skb->nf_trace;
+ break;
+ }
+
+ __nft_trace_packet(info, chain, type);
+}
+
+static inline void nft_trace_verdict(struct nft_traceinfo *info,
+ const struct nft_chain *chain,
+ const struct nft_rule_dp *rule,
+ const struct nft_regs *regs)
+{
+ if (static_branch_unlikely(&nft_trace_enabled)) {
+ info->rule = rule;
+ __nft_trace_verdict(info, chain, regs);
+ }
+}
+
static bool nft_payload_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -70,9 +139,9 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb);
else {
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
return false;
- ptr = skb_network_header(skb) + pkt->xt.thoff;
+ ptr = skb_network_header(skb) + nft_thoff(pkt);
}
ptr += priv->offset;
@@ -101,7 +170,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
base_chain = nft_base_chain(chain);
- rcu_read_lock();
pstats = READ_ONCE(base_chain->stats);
if (pstats) {
local_bh_disable();
@@ -112,12 +180,12 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
u64_stats_update_end(&stats->syncp);
local_bh_enable();
}
- rcu_read_unlock();
}
struct nft_jumpstack {
- const struct nft_chain *chain;
- struct nft_rule *const *rules;
+ const struct nft_chain *chain;
+ const struct nft_rule_dp *rule;
+ const struct nft_rule_dp *last_rule;
};
static void expr_call_ops_eval(const struct nft_expr *expr,
@@ -132,6 +200,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
X(e, nft_payload_eval);
X(e, nft_cmp_eval);
+ X(e, nft_counter_eval);
X(e, nft_meta_get_eval);
X(e, nft_lookup_eval);
X(e, nft_range_eval);
@@ -145,18 +214,28 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
expr->ops->eval(expr, regs, pkt);
}
+#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0]
+#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size
+#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen]
+#define nft_rule_next(rule) (void *)rule + sizeof(*rule) + rule->dlen
+
+#define nft_rule_dp_for_each_expr(expr, last, rule) \
+ for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
+ (expr) != (last); \
+ (expr) = nft_rule_expr_next(expr))
+
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
+ const struct nft_rule_dp *rule, *last_rule;
const struct net *net = nft_net(pkt);
- struct nft_rule *const *rules;
- const struct nft_rule *rule;
const struct nft_expr *expr, *last;
- struct nft_regs regs;
+ struct nft_regs regs = {};
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
bool genbit = READ_ONCE(net->nft.gencursor);
+ struct nft_rule_blob *blob;
struct nft_traceinfo info;
info.trace = false;
@@ -164,18 +243,22 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
nft_trace_init(&info, pkt, &regs.verdict, basechain);
do_chain:
if (genbit)
- rules = rcu_dereference(chain->rules_gen_1);
+ blob = rcu_dereference(chain->blob_gen_1);
else
- rules = rcu_dereference(chain->rules_gen_0);
+ blob = rcu_dereference(chain->blob_gen_0);
+ rule = (struct nft_rule_dp *)blob->data;
+ last_rule = (void *)blob->data + blob->size;
next_rule:
- rule = *rules;
regs.verdict.code = NFT_CONTINUE;
- for (; *rules ; rules++) {
- rule = *rules;
- nft_rule_for_each_expr(expr, last, rule) {
+ for (; rule < last_rule; rule = nft_rule_next(rule)) {
+ nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_cmp16_fast_ops)
+ nft_cmp16_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_bitwise_fast_ops)
+ nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
expr_call_ops_eval(expr, &regs, pkt);
@@ -187,22 +270,23 @@ next_rule:
switch (regs.verdict.code) {
case NFT_BREAK:
regs.verdict.code = NFT_CONTINUE;
+ nft_trace_copy_nftrace(pkt, &info);
continue;
case NFT_CONTINUE:
- nft_trace_packet(&info, chain, rule,
+ nft_trace_packet(pkt, &info, chain, rule,
NFT_TRACETYPE_RULE);
continue;
}
break;
}
+ nft_trace_verdict(&info, chain, rule, &regs);
+
switch (regs.verdict.code & NF_VERDICT_MASK) {
case NF_ACCEPT:
case NF_DROP:
case NF_QUEUE:
case NF_STOLEN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
return regs.verdict.code;
}
@@ -211,32 +295,29 @@ next_rule:
if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
return NF_DROP;
jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rules = rules + 1;
+ jumpstack[stackptr].rule = nft_rule_next(rule);
+ jumpstack[stackptr].last_rule = last_rule;
stackptr++;
- /* fall through */
+ fallthrough;
case NFT_GOTO:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
-
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE:
case NFT_RETURN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RETURN);
break;
default:
- WARN_ON(1);
+ WARN_ON_ONCE(1);
}
if (stackptr > 0) {
stackptr--;
chain = jumpstack[stackptr].chain;
- rules = jumpstack[stackptr].rules;
+ rule = jumpstack[stackptr].rule;
+ last_rule = jumpstack[stackptr].last_rule;
goto next_rule;
}
- nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
+ nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
@@ -257,18 +338,23 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_meta_type,
&nft_rt_type,
&nft_exthdr_type,
+ &nft_last_type,
+ &nft_counter_type,
};
static struct nft_object_type *nft_basic_objects[] = {
#ifdef CONFIG_NETWORK_SECMARK
&nft_secmark_obj_type,
#endif
+ &nft_counter_obj_type,
};
int __init nf_tables_core_module_init(void)
{
int err, i, j = 0;
+ nft_counter_init_seqcount();
+
for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
err = nft_register_obj(nft_basic_objects[i]);
if (err)
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 2bb28483af22..910ef881c3b8 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -28,6 +28,62 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
return flow;
}
+void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
+ enum flow_dissector_key_id addr_type)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_flow_key *mask = &match->mask;
+ struct nft_flow_key *key = &match->key;
+
+ if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL))
+ return;
+
+ key->control.addr_type = addr_type;
+ mask->control.addr_type = 0xffff;
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] =
+ offsetof(struct nft_flow_key, control);
+}
+
+struct nft_offload_ethertype {
+ __be16 value;
+ __be16 mask;
+};
+
+static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_offload_ethertype ethertype = {
+ .value = match->key.basic.n_proto,
+ .mask = match->mask.basic.n_proto,
+ };
+
+ if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) &&
+ (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) ||
+ match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) {
+ match->key.basic.n_proto = match->key.cvlan.vlan_tpid;
+ match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid;
+ match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid;
+ match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid;
+ match->key.vlan.vlan_tpid = ethertype.value;
+ match->mask.vlan.vlan_tpid = ethertype.mask;
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] =
+ offsetof(struct nft_flow_key, cvlan);
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN);
+ } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) &&
+ (match->key.basic.n_proto == htons(ETH_P_8021Q) ||
+ match->key.basic.n_proto == htons(ETH_P_8021AD))) {
+ match->key.basic.n_proto = match->key.vlan.vlan_tpid;
+ match->mask.basic.n_proto = match->mask.vlan.vlan_tpid;
+ match->key.vlan.vlan_tpid = ethertype.value;
+ match->mask.vlan.vlan_tpid = ethertype.mask;
+ match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
+ offsetof(struct nft_flow_key, vlan);
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN);
+ }
+}
+
struct nft_flow_rule *nft_flow_rule_create(struct net *net,
const struct nft_rule *rule)
{
@@ -37,8 +93,9 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr->ops && expr != nft_expr_last(rule)) {
- if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION)
+ while (nft_expr_more(rule, expr)) {
+ if (expr->ops->offload_action &&
+ expr->ops->offload_action(expr))
num_actions++;
expr = nft_expr_next(expr);
@@ -61,7 +118,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
ctx->net = net;
ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
- while (expr->ops && expr != nft_expr_last(rule)) {
+ while (nft_expr_more(rule, expr)) {
if (!expr->ops->offload) {
err = -EOPNOTSUPP;
goto err_out;
@@ -72,6 +129,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
expr = nft_expr_next(expr);
}
+ nft_flow_rule_transfer_vlan(ctx, flow);
+
flow->proto = ctx->dep.l3num;
kfree(ctx);
@@ -149,7 +208,7 @@ static int nft_setup_cb_call(enum tc_setup_type type, void *type_data,
return 0;
}
-int nft_chain_offload_priority(struct nft_base_chain *basechain)
+static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
{
if (basechain->ops.priority <= 0 ||
basechain->ops.priority > USHRT_MAX)
@@ -158,6 +217,27 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain)
return 0;
}
+bool nft_chain_offload_support(const struct nft_base_chain *basechain)
+{
+ struct net_device *dev;
+ struct nft_hook *hook;
+
+ if (nft_chain_offload_priority(basechain) < 0)
+ return false;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (hook->ops.pf != NFPROTO_NETDEV ||
+ hook->ops.hooknum != NF_NETDEV_INGRESS)
+ return false;
+
+ dev = hook->ops.dev;
+ if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists())
+ return false;
+ }
+
+ return true;
+}
+
static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
const struct nft_base_chain *basechain,
const struct nft_rule *rule,
@@ -180,26 +260,56 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
cls_flow->rule = flow->rule;
}
-static int nft_flow_offload_rule(struct nft_chain *chain,
- struct nft_rule *rule,
- struct nft_flow_rule *flow,
- enum flow_cls_command command)
+static int nft_flow_offload_cmd(const struct nft_chain *chain,
+ const struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command,
+ struct flow_cls_offload *cls_flow)
{
struct netlink_ext_ack extack = {};
- struct flow_cls_offload cls_flow;
struct nft_base_chain *basechain;
if (!nft_is_base_chain(chain))
return -EOPNOTSUPP;
basechain = nft_base_chain(chain);
- nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack,
+ nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack,
command);
- return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow,
+ return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow,
&basechain->flow_block.cb_list);
}
+static int nft_flow_offload_rule(const struct nft_chain *chain,
+ struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command)
+{
+ struct flow_cls_offload cls_flow;
+
+ return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow);
+}
+
+int nft_flow_rule_stats(const struct nft_chain *chain,
+ const struct nft_rule *rule)
+{
+ struct flow_cls_offload cls_flow = {};
+ struct nft_expr *expr, *next;
+ int err;
+
+ err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS,
+ &cls_flow);
+ if (err < 0)
+ return err;
+
+ nft_rule_for_each_expr(expr, next, rule) {
+ if (expr->ops->offload_stats)
+ expr->ops->offload_stats(expr, &cls_flow.stats);
+ }
+
+ return 0;
+}
+
static int nft_flow_offload_bind(struct flow_block_offload *bo,
struct nft_base_chain *basechain)
{
@@ -265,6 +375,7 @@ static void nft_flow_block_offload_init(struct flow_block_offload *bo,
bo->command = cmd;
bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
bo->extack = extack;
+ bo->cb_list_head = &basechain->flow_block.cb_list;
INIT_LIST_HEAD(&bo->cb_list);
}
@@ -285,44 +396,46 @@ static int nft_block_offload_cmd(struct nft_base_chain *chain,
return nft_block_setup(chain, &bo, cmd);
}
-static void nft_indr_block_ing_cmd(struct net_device *dev,
- struct nft_base_chain *chain,
- flow_indr_block_bind_cb_t *cb,
- void *cb_priv,
- enum flow_block_command cmd)
+static void nft_indr_block_cleanup(struct flow_block_cb *block_cb)
{
+ struct nft_base_chain *basechain = block_cb->indr.data;
+ struct net_device *dev = block_cb->indr.dev;
struct netlink_ext_ack extack = {};
+ struct nftables_pernet *nft_net;
+ struct net *net = dev_net(dev);
struct flow_block_offload bo;
- if (!chain)
- return;
-
- nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
-
- cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
-
- nft_block_setup(chain, &bo, cmd);
+ nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND,
+ basechain, &extack);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ list_del(&block_cb->driver_list);
+ list_move(&block_cb->list, &bo.cb_list);
+ nft_flow_offload_unbind(&bo, basechain);
+ mutex_unlock(&nft_net->commit_mutex);
}
-static int nft_indr_block_offload_cmd(struct nft_base_chain *chain,
+static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
struct net_device *dev,
enum flow_block_command cmd)
{
struct netlink_ext_ack extack = {};
struct flow_block_offload bo;
+ int err;
- nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
+ nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack);
- flow_indr_block_call(dev, &bo, cmd);
+ err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo,
+ nft_indr_block_cleanup);
+ if (err < 0)
+ return err;
if (list_empty(&bo.cb_list))
return -EOPNOTSUPP;
- return nft_block_setup(chain, &bo, cmd);
+ return nft_block_setup(basechain, &bo, cmd);
}
-#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
-
static int nft_chain_offload_cmd(struct nft_base_chain *basechain,
struct net_device *dev,
enum flow_block_command cmd)
@@ -395,9 +508,10 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
static void nft_flow_rule_offload_abort(struct net *net,
struct nft_trans *trans)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
int err = 0;
- list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) {
+ list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) {
if (trans->ctx.family != NFPROTO_NETDEV)
continue;
@@ -443,11 +557,12 @@ static void nft_flow_rule_offload_abort(struct net *net,
int nft_flow_rule_offload_commit(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans;
int err = 0;
u8 policy;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
if (trans->ctx.family != NFPROTO_NETDEV)
continue;
@@ -499,35 +614,18 @@ int nft_flow_rule_offload_commit(struct net *net)
}
}
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- if (trans->ctx.family != NFPROTO_NETDEV)
- continue;
-
- switch (trans->msg_type) {
- case NFT_MSG_NEWRULE:
- case NFT_MSG_DELRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
- continue;
-
- nft_flow_rule_destroy(nft_trans_flow_rule(trans));
- break;
- default:
- break;
- }
- }
-
return err;
}
-static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
+static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net,
+ struct net_device *dev)
{
struct nft_base_chain *basechain;
- struct net *net = dev_net(dev);
struct nft_hook *hook, *found;
const struct nft_table *table;
struct nft_chain *chain;
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV)
continue;
@@ -555,69 +653,39 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
return NULL;
}
-static void nft_indr_block_cb(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb, void *cb_priv,
- enum flow_block_command cmd)
-{
- struct net *net = dev_net(dev);
- struct nft_chain *chain;
-
- mutex_lock(&net->nft.commit_mutex);
- chain = __nft_offload_get_chain(dev);
- if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) {
- struct nft_base_chain *basechain;
-
- basechain = nft_base_chain(chain);
- nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd);
- }
- mutex_unlock(&net->nft.commit_mutex);
-}
-
static int nft_offload_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nftables_pernet *nft_net;
struct net *net = dev_net(dev);
struct nft_chain *chain;
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
- mutex_lock(&net->nft.commit_mutex);
- chain = __nft_offload_get_chain(dev);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ chain = __nft_offload_get_chain(nft_net, dev);
if (chain)
nft_flow_block_chain(nft_base_chain(chain), dev,
FLOW_BLOCK_UNBIND);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return NOTIFY_DONE;
}
-static struct flow_indr_block_entry block_ing_entry = {
- .cb = nft_indr_block_cb,
- .list = LIST_HEAD_INIT(block_ing_entry.list),
-};
-
static struct notifier_block nft_offload_netdev_notifier = {
.notifier_call = nft_offload_netdev_event,
};
int nft_offload_init(void)
{
- int err;
-
- err = register_netdevice_notifier(&nft_offload_netdev_notifier);
- if (err < 0)
- return err;
-
- flow_indr_add_block_cb(&block_ing_entry);
-
- return 0;
+ return register_netdevice_notifier(&nft_offload_netdev_notifier);
}
void nft_offload_exit(void)
{
- flow_indr_del_block_cb(&block_ing_entry);
unregister_netdevice_notifier(&nft_offload_netdev_notifier);
}
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
deleted file mode 100644
index 586b621007eb..000000000000
--- a/net/netfilter/nf_tables_set_core.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/module.h>
-#include <net/netfilter/nf_tables_core.h>
-
-static int __init nf_tables_set_module_init(void)
-{
- nft_register_set(&nft_set_hash_fast_type);
- nft_register_set(&nft_set_hash_type);
- nft_register_set(&nft_set_rhash_type);
- nft_register_set(&nft_set_bitmap_type);
- nft_register_set(&nft_set_rbtree_type);
- nft_register_set(&nft_set_pipapo_type);
-
- return 0;
-}
-
-static void __exit nf_tables_set_module_exit(void)
-{
- nft_unregister_set(&nft_set_pipapo_type);
- nft_unregister_set(&nft_set_rbtree_type);
- nft_unregister_set(&nft_set_bitmap_type);
- nft_unregister_set(&nft_set_rhash_type);
- nft_unregister_set(&nft_set_hash_type);
- nft_unregister_set(&nft_set_hash_fast_type);
-}
-
-module_init(nf_tables_set_module_init);
-module_exit(nf_tables_set_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 87b36da5cd98..1163ba9c1401 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -7,7 +7,7 @@
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/hash.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/skbuff.h>
@@ -25,22 +25,6 @@
DEFINE_STATIC_KEY_FALSE(nft_trace_enabled);
EXPORT_SYMBOL_GPL(nft_trace_enabled);
-static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb)
-{
- __be32 id;
-
- /* using skb address as ID results in a limited number of
- * values (and quick reuse).
- *
- * So we attempt to use as many skb members that will not
- * change while skb is with netfilter.
- */
- id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb),
- skb->skb_iif);
-
- return nla_put_be32(nlskb, NFTA_TRACE_ID, id);
-}
-
static int trace_fill_header(struct sk_buff *nlskb, u16 type,
const struct sk_buff *skb,
int off, unsigned int len)
@@ -113,17 +97,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
int off = skb_network_offset(skb);
unsigned int len, nh_end;
- nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
+ nh_end = pkt->flags & NFT_PKTINFO_L4PROTO ? nft_thoff(pkt) : skb->len;
len = min_t(unsigned int, nh_end - skb_network_offset(skb),
NFT_TRACETYPE_NETWORK_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
return -1;
- if (pkt->tprot_set) {
- len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+ if (pkt->flags & NFT_PKTINFO_L4PROTO) {
+ len = min_t(unsigned int, skb->len - nft_thoff(pkt),
NFT_TRACETYPE_TRANSPORT_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
- pkt->xt.thoff, len))
+ nft_thoff(pkt), len))
return -1;
}
@@ -142,7 +126,7 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
const struct nft_traceinfo *info)
{
- if (!info->rule)
+ if (!info->rule || info->rule->is_last)
return 0;
/* a continue verdict with ->type == RETURN means that this is
@@ -183,10 +167,10 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
void nft_trace_notify(struct nft_traceinfo *info)
{
const struct nft_pktinfo *pkt = info->pkt;
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct sk_buff *skb;
unsigned int size;
+ u32 mark = 0;
u16 event;
if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
@@ -219,22 +203,18 @@ void nft_trace_notify(struct nft_traceinfo *info)
return;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE);
- nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family,
+ NFNETLINK_V0, 0);
if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = info->basechain->type->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
goto nla_put_failure;
- if (trace_fill_id(skb, pkt->skb))
+ if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
@@ -254,16 +234,24 @@ void nft_trace_notify(struct nft_traceinfo *info)
case NFT_TRACETYPE_RULE:
if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
goto nla_put_failure;
+
+ /* pkt->skb undefined iff NF_STOLEN, disable dump */
+ if (info->verdict->code == NF_STOLEN)
+ info->packet_dumped = true;
+ else
+ mark = pkt->skb->mark;
+
break;
case NFT_TRACETYPE_POLICY:
+ mark = pkt->skb->mark;
+
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
- if (pkt->skb->mark &&
- nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark)))
+ if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark)))
goto nla_put_failure;
if (!info->packet_dumped) {
@@ -288,9 +276,20 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
const struct nft_verdict *verdict,
const struct nft_chain *chain)
{
+ static siphash_key_t trace_key __read_mostly;
+ struct sk_buff *skb = pkt->skb;
+
info->basechain = nft_base_chain(chain);
info->trace = true;
+ info->nf_trace = pkt->skb->nf_trace;
info->packet_dumped = false;
info->pkt = pkt;
info->verdict = verdict;
+
+ net_get_random_once(&trace_key, sizeof(trace_key));
+
+ info->skbid = (u32)siphash_3u32(hash32_ptr(skb),
+ skb_get_hash(skb),
+ skb->skb_iif,
+ &trace_key);
}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 99127e2d95a8..6d18fb346868 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -28,11 +28,13 @@
#include <linux/sched/signal.h>
#include <net/netlink.h>
+#include <net/netns/generic.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+MODULE_DESCRIPTION("Netfilter messages via netlink socket");
#define nfnl_dereference_protected(id) \
rcu_dereference_protected(table[(id)].subsys, \
@@ -40,11 +42,39 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
#define NFNL_MAX_ATTR_COUNT 32
+static unsigned int nfnetlink_pernet_id __read_mostly;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static DEFINE_SPINLOCK(nfnl_grp_active_lock);
+#endif
+
+struct nfnl_net {
+ struct sock *nfnl;
+};
+
static struct {
struct mutex mutex;
const struct nfnetlink_subsystem __rcu *subsys;
} table[NFNL_SUBSYS_COUNT];
+static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT];
+
+static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = {
+ [NFNL_SUBSYS_NONE] = "nfnl_subsys_none",
+ [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink",
+ [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp",
+ [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue",
+ [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog",
+ [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf",
+ [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset",
+ [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct",
+ [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout",
+ [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper",
+ [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables",
+ [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat",
+ [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook",
+};
+
static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
@@ -57,6 +87,11 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
};
+static struct nfnl_net *nfnl_pernet(struct net *net)
+{
+ return net_generic(net, nfnetlink_pernet_id);
+}
+
void nfnl_lock(__u8 subsys_id)
{
mutex_lock(&table[subsys_id].mutex);
@@ -131,30 +166,51 @@ nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
int nfnetlink_has_listeners(struct net *net, unsigned int group)
{
- return netlink_has_listeners(net->nfnl, group);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return netlink_has_listeners(nfnlnet->nfnl, group);
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
unsigned int group, int echo, gfp_t flags)
{
- return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
- return netlink_set_err(net->nfnl, portid, group, error);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return netlink_set_err(nfnlnet->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
- int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
{
- return netlink_unicast(net->nfnl, skb, portid, flags);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+ int err;
+
+ err = nlmsg_unicast(nfnlnet->nfnl, skb, portid);
+ if (err == -EAGAIN)
+ err = -ENOBUFS;
+
+ return err;
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
+ __u32 group, gfp_t allocation)
+{
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_broadcast);
+
/* Process one complete nfnetlink message. */
static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
@@ -171,6 +227,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
type = nlh->nlmsg_type;
replay:
rcu_read_lock();
+
ss = nfnetlink_get_subsys(type);
if (!ss) {
#ifdef CONFIG_MODULES
@@ -194,11 +251,19 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
__u8 subsys_id = NFNL_SUBSYS_ID(type);
+ struct nfnl_info info = {
+ .net = net,
+ .sk = nfnlnet->nfnl,
+ .nlh = nlh,
+ .nfmsg = nlmsg_data(nlh),
+ .extack = extack,
+ };
/* Sanity-check NFNL_MAX_ATTR_COUNT */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -214,24 +279,32 @@ replay:
return err;
}
- if (nc->call_rcu) {
- err = nc->call_rcu(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- extack);
+ if (!nc->call) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ switch (nc->type) {
+ case NFNL_CB_RCU:
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
rcu_read_unlock();
- } else {
+ break;
+ case NFNL_CB_MUTEX:
rcu_read_unlock();
nfnl_lock(subsys_id);
if (nfnl_dereference_protected(subsys_id) != ss ||
- nfnetlink_find_client(type, ss) != nc)
+ nfnetlink_find_client(type, ss) != nc) {
+ nfnl_unlock(subsys_id);
err = -EAGAIN;
- else if (nc->call)
- err = nc->call(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- extack);
- else
- err = -EINVAL;
+ break;
+ }
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
nfnl_unlock(subsys_id);
+ break;
+ default:
+ rcu_read_unlock();
+ err = -EINVAL;
+ break;
}
if (err == -EAGAIN)
goto replay;
@@ -310,7 +383,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
status = 0;
-
+replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
@@ -409,12 +482,25 @@ replay:
goto ack;
}
+ if (nc->type != NFNL_CB_BATCH) {
+ err = -EINVAL;
+ goto ack;
+ }
+
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
+ u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
int attrlen = nlh->nlmsg_len - min_len;
+ struct nfnl_info info = {
+ .net = net,
+ .sk = nfnlnet->nfnl,
+ .nlh = nlh,
+ .nfmsg = nlmsg_data(nlh),
+ .extack = &extack,
+ };
/* Sanity-check NFTA_MAX_ATTR */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -429,11 +515,7 @@ replay:
if (err < 0)
goto ack;
- if (nc->call_batch) {
- err = nc->call_batch(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- &extack);
- }
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
/* The lock was released to autoload some module, we
* have to abort and start from scratch using the
@@ -476,7 +558,7 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- ss->abort(net, oskb, true);
+ ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD);
nfnl_err_reset(&err_list);
kfree_skb(skb);
module_put(ss->owner);
@@ -487,11 +569,25 @@ done:
status |= NFNL_BATCH_REPLAY;
goto done;
} else if (err) {
- ss->abort(net, oskb, false);
+ ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
}
} else {
- ss->abort(net, oskb, false);
+ enum nfnl_abort_action abort_action;
+
+ if (status & NFNL_BATCH_FAILURE)
+ abort_action = NFNL_ABORT_NONE;
+ else
+ abort_action = NFNL_ABORT_VALIDATE;
+
+ err = ss->abort(net, oskb, abort_action);
+ if (err == -EAGAIN) {
+ nfnl_err_reset(&err_list);
+ kfree_skb(skb);
+ module_put(ss->owner);
+ status |= NFNL_BATCH_FAILURE;
+ goto replay_abort;
+ }
}
if (ss->cleanup)
ss->cleanup(net);
@@ -535,7 +631,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
/* Work around old nft using host byte order */
- if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES)
res_id = NFNL_SUBSYS_NFTABLES;
else
res_id = ntohs(nfgenmsg->res_id);
@@ -563,7 +659,44 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
-#ifdef CONFIG_MODULES
+static void nfnetlink_bind_event(struct net *net, unsigned int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+ u8 v;
+
+ /* All NFNLGRP_CONNTRACK_* group bits fit into u8.
+ * The other groups are not relevant and can be ignored.
+ */
+ if (group >= 8)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ v = READ_ONCE(net->ct.ctnetlink_has_listener);
+ if ((v & group_bit) == 0) {
+ v |= group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+ }
+
+ spin_unlock(&nfnl_grp_active_lock);
+#endif
+}
+
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
@@ -579,43 +712,82 @@ static int nfnetlink_bind(struct net *net, int group)
rcu_read_unlock();
if (!ss)
request_module_nowait("nfnetlink-subsys-%d", type);
+
+ nfnetlink_bind_event(net, group);
return 0;
}
+
+static void nfnetlink_unbind(struct net *net, int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+
+ if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ /* ctnetlink_has_listener is u8 */
+ if (group >= 8)
+ return;
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ if (!nfnetlink_has_listeners(net, group)) {
+ u8 v = READ_ONCE(net->ct.ctnetlink_has_listener);
+
+ v &= ~group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+ }
+ spin_unlock(&nfnl_grp_active_lock);
#endif
+}
static int __net_init nfnetlink_net_init(struct net *net)
{
- struct sock *nfnl;
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
-#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
-#endif
+ .unbind = nfnetlink_unbind,
};
- nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
- if (!nfnl)
+ nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
+ if (!nfnlnet->nfnl)
return -ENOMEM;
- net->nfnl_stash = nfnl;
- rcu_assign_pointer(net->nfnl, nfnl);
return 0;
}
static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
{
+ struct nfnl_net *nfnlnet;
struct net *net;
- list_for_each_entry(net, net_exit_list, exit_list)
- RCU_INIT_POINTER(net->nfnl, NULL);
- synchronize_net();
- list_for_each_entry(net, net_exit_list, exit_list)
- netlink_kernel_release(net->nfnl_stash);
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nfnlnet = nfnl_pernet(net);
+
+ netlink_kernel_release(nfnlnet->nfnl);
+ }
}
static struct pernet_operations nfnetlink_net_ops = {
.init = nfnetlink_net_init,
.exit_batch = nfnetlink_net_exit_batch,
+ .id = &nfnetlink_pernet_id,
+ .size = sizeof(struct nfnl_net),
};
static int __init nfnetlink_init(void)
@@ -626,7 +798,7 @@ static int __init nfnetlink_init(void)
BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
for (i=0; i<NFNL_SUBSYS_COUNT; i++)
- mutex_init(&table[i].mutex);
+ __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]);
return register_pernet_subsys(&nfnetlink_net_ops);
}
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 2481470dec36..505f46a32173 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
- * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ * (C) 2011 Intra2net AG <https://www.intra2net.com>
*/
#include <linux/init.h>
#include <linux/module.h>
@@ -16,6 +16,7 @@
#include <linux/errno.h>
#include <net/netlink.h>
#include <net/sock.h>
+#include <net/netns/generic.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
@@ -33,7 +34,7 @@ struct nf_acct {
refcount_t refcnt;
char name[NFACCT_NAME_MAX];
struct rcu_head rcu_head;
- char data[0];
+ char data[];
};
struct nfacct_filter {
@@ -41,17 +42,27 @@ struct nfacct_filter {
u32 mask;
};
+struct nfnl_acct_net {
+ struct list_head nfnl_acct_list;
+};
+
+static unsigned int nfnl_acct_net_id __read_mostly;
+
+static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_acct_net_id);
+}
+
#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */
-static int nfnl_acct_new(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
struct nf_acct *nfacct, *matching = NULL;
- char *acct_name;
unsigned int size = 0;
+ char *acct_name;
u32 flags = 0;
if (!tb[NFACCT_NAME])
@@ -61,11 +72,11 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
if (strlen(acct_name) == 0)
return -EINVAL;
- list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
+ list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
matching = nfacct;
@@ -73,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
}
if (matching) {
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* reset counters if you request a replacement. */
atomic64_set(&matching->pkts, 0);
atomic64_set(&matching->bytes, 0);
@@ -112,7 +123,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
nfacct->flags = flags;
}
- nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
+ nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
if (tb[NFACCT_BYTES]) {
atomic64_set(&nfacct->bytes,
@@ -123,7 +134,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
}
refcount_set(&nfacct->refcnt, 1);
- list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
+ list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list);
return 0;
}
@@ -132,21 +143,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct nf_acct *acct)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
u64 pkts, bytes;
u32 old_flags;
event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFACCT_NAME, acct->name))
goto nla_put_failure;
@@ -188,6 +194,7 @@ static int
nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *last;
const struct nfacct_filter *filter = cb->data;
@@ -199,7 +206,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (last) {
if (cur != last)
continue;
@@ -264,16 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb)
return 0;
}
-static int nfnl_acct_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_acct_dump,
.start = nfnl_acct_start,
@@ -281,14 +287,14 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
.data = (void *)tb[NFACCT_FILTER],
};
- return netlink_dump_start(nfnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (!tb[NFACCT_NAME])
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -301,21 +307,18 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
}
ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
- NFNL_MSG_ACCT_NEW, cur);
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
@@ -337,24 +340,23 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
return ret;
}
-static int nfnl_acct_del(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
struct nf_acct *cur, *tmp;
int ret = -ENOENT;
char *acct_name;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head)
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
}
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -377,18 +379,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
};
static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
- [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_NEW] = {
+ .call = nfnl_acct_new,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_GET] = {
+ .call = nfnl_acct_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_GET_CTRZERO] = {
+ .call = nfnl_acct_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_DEL] = {
+ .call = nfnl_acct_del,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
};
static const struct nfnetlink_subsystem nfnl_acct_subsys = {
@@ -402,10 +416,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *acct = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
continue;
@@ -457,8 +472,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
kfree_skb(skb);
return;
}
- netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
- GFP_ATOMIC);
+ nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC);
}
int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct)
@@ -488,16 +502,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
static int __net_init nfnl_acct_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfnl_acct_list);
+ INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list);
return 0;
}
static void __net_exit nfnl_acct_net_exit(struct net *net)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *tmp;
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) {
list_del_rcu(&cur->head);
if (refcount_dec_and_test(&cur->refcnt))
@@ -508,6 +523,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net)
static struct pernet_operations nfnl_acct_ops = {
.init = nfnl_acct_net_init,
.exit = nfnl_acct_net_exit,
+ .id = &nfnl_acct_net_id,
+ .size = sizeof(struct nfnl_acct_net),
};
static int __init nfnl_acct_init(void)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index a5f294aa8e4c..97248963a7d3 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -96,14 +96,16 @@ static int
nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
{
struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
if (attr == NULL)
return -EINVAL;
- if (help->helper->data_len == 0)
+ helper = rcu_dereference(help->helper);
+ if (!helper || helper->data_len == 0)
return -EINVAL;
- nla_memcpy(help->data, nla_data(attr), sizeof(help->data));
+ nla_memcpy(help->data, attr, sizeof(help->data));
return 0;
}
@@ -111,9 +113,11 @@ static int
nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
{
const struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
- if (help->helper->data_len &&
- nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+ helper = rcu_dereference(help->helper);
+ if (helper && helper->data_len &&
+ nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data))
goto nla_put_failure;
return 0;
@@ -146,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
!tb[NFCTH_POLICY_EXPECT_TIMEOUT])
return -EINVAL;
- nla_strlcpy(expect_policy->name,
+ nla_strscpy(expect_policy->name,
tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
@@ -233,13 +237,14 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
if (ret < 0)
goto err1;
- nla_strlcpy(helper->name,
+ nla_strscpy(helper->name,
tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
if (size > sizeof_field(struct nf_conn_help, data)) {
ret = -ENOMEM;
goto err2;
}
+ helper->data_len = size;
helper->flags |= NF_CT_HELPER_F_USERSPACE;
memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple));
@@ -379,10 +384,14 @@ static int
nfnl_cthelper_update(const struct nlattr * const tb[],
struct nf_conntrack_helper *helper)
{
+ u32 size;
int ret;
- if (tb[NFCTH_PRIV_DATA_LEN])
- return -EBUSY;
+ if (tb[NFCTH_PRIV_DATA_LEN]) {
+ size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+ if (size != helper->data_len)
+ return -EBUSY;
+ }
if (tb[NFCTH_POLICY]) {
ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
@@ -407,10 +416,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
return 0;
}
-static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
const char *helper_name;
struct nf_conntrack_helper *cur, *helper = NULL;
@@ -440,7 +447,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
tuple.dst.protonum != cur->tuple.dst.protonum))
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
helper = cur;
@@ -525,20 +532,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct nf_conntrack_helper *helper)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
int status;
event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFCTH_NAME, helper->name))
goto nla_put_failure;
@@ -611,10 +613,8 @@ out:
return skb->len;
}
-static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
int ret = -ENOENT;
struct nf_conntrack_helper *cur;
@@ -627,11 +627,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_cthelper_dump_table,
};
- return netlink_dump_start(nfnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (tb[NFCTH_NAME])
@@ -663,29 +663,23 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
}
ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
NFNL_MSG_CTHELPER_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
-
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
-static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
char *helper_name = NULL;
struct nf_conntrack_helper *cur;
@@ -747,15 +741,24 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
};
static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
- [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
- [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
- [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_NEW] = {
+ .call = nfnl_cthelper_new,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
+ [NFNL_MSG_CTHELPER_GET] = {
+ .call = nfnl_cthelper_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
+ [NFNL_MSG_CTHELPER_DEL] = {
+ .call = nfnl_cthelper_del,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
};
static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index da915c224a82..f466af4f8531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -20,6 +20,7 @@
#include <linux/netfilter.h>
#include <net/netlink.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -30,6 +31,24 @@
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
+static unsigned int nfct_timeout_id __read_mostly;
+
+struct ctnl_timeout {
+ struct list_head head;
+ struct list_head free_head;
+ struct rcu_head rcu_head;
+ refcount_t refcnt;
+ char name[CTNL_TIMEOUT_NAME_MAX];
+
+ /* must be at the end */
+ struct nf_ct_timeout timeout;
+};
+
+struct nfct_timeout_pernet {
+ struct list_head nfct_timeout_list;
+ struct list_head nfct_timeout_freelist;
+};
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
@@ -42,6 +61,11 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
[CTA_TIMEOUT_DATA] = { .type = NLA_NESTED },
};
+static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net)
+{
+ return net_generic(net, nfct_timeout_id);
+}
+
static int
ctnl_timeout_parse_policy(void *timeout,
const struct nf_conntrack_l4proto *l4proto,
@@ -71,12 +95,11 @@ err:
return ret;
}
-static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_new_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
__u16 l3num;
__u8 l4num;
const struct nf_conntrack_l4proto *l4proto;
@@ -94,11 +117,11 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- list_for_each_entry(timeout, &net->nfct_timeout_list, head) {
+ list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
matching = timeout;
@@ -106,7 +129,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
}
if (matching) {
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* You cannot replace one timeout policy by another of
* different kind, sorry.
*/
@@ -116,7 +139,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
return ctnl_timeout_parse_policy(&matching->timeout.data,
matching->timeout.l4proto,
- net, cda[CTA_TIMEOUT_DATA]);
+ info->net,
+ cda[CTA_TIMEOUT_DATA]);
}
return -EBUSY;
@@ -137,8 +161,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
goto err_proto_put;
}
- ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net,
- cda[CTA_TIMEOUT_DATA]);
+ ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto,
+ info->net, cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -146,7 +170,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
timeout->timeout.l3num = l3num;
timeout->timeout.l4proto = l4proto;
refcount_set(&timeout->refcnt, 1);
- list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list);
+ __module_get(THIS_MODULE);
+ list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list);
return 0;
err:
@@ -160,22 +185,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct ctnl_timeout *timeout)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
struct nlattr *nest_parms;
int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
nla_put_be16(skb, CTA_TIMEOUT_L3PROTO,
htons(timeout->timeout.l3num)) ||
@@ -206,6 +226,7 @@ nla_put_failure:
static int
ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nfct_timeout_pernet *pernet;
struct net *net = sock_net(skb->sk);
struct ctnl_timeout *cur, *last;
@@ -217,7 +238,8 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) {
+ pernet = nfct_timeout_pernet(net);
+ list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) {
if (last) {
if (cur != last)
continue;
@@ -238,28 +260,27 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_get_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
int ret = -ENOENT;
char *name;
struct ctnl_timeout *cur;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnl_timeout_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (!cda[CTA_TIMEOUT_NAME])
return -EINVAL;
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &net->nfct_timeout_list, head) {
+ list_for_each_entry(cur, &pernet->nfct_timeout_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
@@ -272,21 +293,18 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
}
ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
@@ -309,30 +327,29 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
return ret;
}
-static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_del_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
struct ctnl_timeout *cur, *tmp;
int ret = -ENOENT;
char *name;
if (!cda[CTA_TIMEOUT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list,
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list,
head)
- ctnl_timeout_try_del(net, cur);
+ ctnl_timeout_try_del(info->net, cur);
return 0;
}
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &net->nfct_timeout_list, head) {
+ list_for_each_entry(cur, &pernet->nfct_timeout_list, head) {
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- ret = ctnl_timeout_try_del(net, cur);
+ ret = ctnl_timeout_try_del(info->net, cur);
if (ret < 0)
return ret;
@@ -341,11 +358,9 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
return ret;
}
-static int cttimeout_default_set(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_default_set(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
const struct nf_conntrack_l4proto *l4proto;
__u8 l4num;
@@ -365,7 +380,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
goto err;
}
- ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
+ ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net,
cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -382,21 +397,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
const unsigned int *timeouts)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
struct nlattr *nest_parms;
int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
goto nla_put_failure;
@@ -420,18 +430,16 @@ nla_put_failure:
return -1;
}
-static int cttimeout_default_get(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_default_get(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
const struct nf_conntrack_l4proto *l4proto;
unsigned int *timeouts = NULL;
struct sk_buff *skb2;
- int ret, err;
__u16 l3num;
__u8 l4num;
+ int ret;
if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
return -EINVAL;
@@ -440,41 +448,40 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
l4proto = nf_ct_l4proto_find(l4num);
- err = -EOPNOTSUPP;
if (l4proto->l4proto != l4num)
- goto err;
+ return -EOPNOTSUPP;
switch (l4proto->l4proto) {
case IPPROTO_ICMP:
- timeouts = &nf_icmp_pernet(net)->timeout;
+ timeouts = &nf_icmp_pernet(info->net)->timeout;
break;
case IPPROTO_TCP:
- timeouts = nf_tcp_pernet(net)->timeouts;
+ timeouts = nf_tcp_pernet(info->net)->timeouts;
break;
- case IPPROTO_UDP: /* fallthrough */
+ case IPPROTO_UDP:
case IPPROTO_UDPLITE:
- timeouts = nf_udp_pernet(net)->timeouts;
+ timeouts = nf_udp_pernet(info->net)->timeouts;
break;
case IPPROTO_DCCP:
#ifdef CONFIG_NF_CT_PROTO_DCCP
- timeouts = nf_dccp_pernet(net)->dccp_timeout;
+ timeouts = nf_dccp_pernet(info->net)->dccp_timeout;
#endif
break;
case IPPROTO_ICMPV6:
- timeouts = &nf_icmpv6_pernet(net)->timeout;
+ timeouts = &nf_icmpv6_pernet(info->net)->timeout;
break;
case IPPROTO_SCTP:
#ifdef CONFIG_NF_CT_PROTO_SCTP
- timeouts = nf_sctp_pernet(net)->timeouts;
+ timeouts = nf_sctp_pernet(info->net)->timeouts;
#endif
break;
case IPPROTO_GRE:
#ifdef CONFIG_NF_CT_PROTO_GRE
- timeouts = nf_gre_pernet(net)->timeouts;
+ timeouts = nf_gre_pernet(info->net)->timeouts;
#endif
break;
case 255:
- timeouts = &nf_generic_pernet(net)->timeout;
+ timeouts = &nf_generic_pernet(info->net)->timeout;
break;
default:
WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto);
@@ -482,50 +489,38 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
}
if (!timeouts)
- goto err;
+ return -EOPNOTSUPP;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
- err = -ENOMEM;
- goto err;
- }
+ if (!skb2)
+ return -ENOMEM;
- ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ ret = cttimeout_default_fill_info(info->net, skb2,
+ NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
l3num, l4proto, timeouts);
if (ret <= 0) {
kfree_skb(skb2);
- err = -ENOMEM;
- goto err;
+ return -ENOMEM;
}
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
-err:
- return err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
const char *name)
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *timeout, *matching = NULL;
- list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) {
+ list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (!try_module_get(THIS_MODULE))
- goto err;
-
- if (!refcount_inc_not_zero(&timeout->refcnt)) {
- module_put(THIS_MODULE);
+ if (!refcount_inc_not_zero(&timeout->refcnt))
goto err;
- }
matching = timeout;
break;
}
@@ -538,28 +533,43 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t)
struct ctnl_timeout *timeout =
container_of(t, struct ctnl_timeout, timeout);
- if (refcount_dec_and_test(&timeout->refcnt))
+ if (refcount_dec_and_test(&timeout->refcnt)) {
kfree_rcu(timeout, rcu_head);
-
- module_put(THIS_MODULE);
+ module_put(THIS_MODULE);
+ }
}
static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
- [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_NEW] = {
+ .call = cttimeout_new_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_GET] = {
+ .call = cttimeout_get_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DELETE] = {
+ .call = cttimeout_del_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = {
+ .call = cttimeout_default_set,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = {
+ .call = cttimeout_default_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
};
static const struct nfnetlink_subsystem cttimeout_subsys = {
@@ -573,20 +583,39 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
static int __net_init cttimeout_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfct_timeout_list);
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+
+ INIT_LIST_HEAD(&pernet->nfct_timeout_list);
+ INIT_LIST_HEAD(&pernet->nfct_timeout_freelist);
return 0;
}
+static void __net_exit cttimeout_net_pre_exit(struct net *net)
+{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+ struct ctnl_timeout *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
+ list_del_rcu(&cur->head);
+ list_add(&cur->free_head, &pernet->nfct_timeout_freelist);
+ }
+
+ /* core calls synchronize_rcu() after this */
+}
+
static void __net_exit cttimeout_net_exit(struct net *net)
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
- nf_ct_unconfirmed_destroy(net);
+ if (list_empty(&pernet->nfct_timeout_freelist))
+ return;
+
nf_ct_untimeout(net, NULL);
- list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
- list_del_rcu(&cur->head);
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) {
+ list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
@@ -595,7 +624,15 @@ static void __net_exit cttimeout_net_exit(struct net *net)
static struct pernet_operations cttimeout_ops = {
.init = cttimeout_net_init,
+ .pre_exit = cttimeout_net_pre_exit,
.exit = cttimeout_net_exit,
+ .id = &nfct_timeout_id,
+ .size = sizeof(struct nfct_timeout_pernet),
+};
+
+static const struct nf_ct_timeout_hooks hooks = {
+ .timeout_find_get = ctnl_timeout_find_get,
+ .timeout_put = ctnl_timeout_put,
};
static int __init cttimeout_init(void)
@@ -612,8 +649,7 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+ RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks);
return 0;
err_out:
@@ -621,14 +657,24 @@ err_out:
return ret;
}
+static int untimeout(struct nf_conn *ct, void *timeout)
+{
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+
+ return 0;
+}
+
static void __exit cttimeout_exit(void)
{
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
+
+ nf_ct_iterate_destroy(untimeout, NULL);
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
new file mode 100644
index 000000000000..8120aadf6a0f
--- /dev/null
+++ b/net/netfilter/nfnetlink_hook.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021 Red Hat GmbH
+ *
+ * Author: Florian Westphal <fw@strlen.de>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_hook.h>
+
+#include <net/netfilter/nf_tables.h>
+#include <net/sock.h>
+
+static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = {
+ [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 },
+ [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 },
+ [NFNLA_HOOK_DEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING,
+ .len = KSYM_NAME_LEN, },
+ [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING,
+ .len = MODULE_NAME_LEN, },
+ [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, },
+};
+
+static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *c)
+{
+ int err;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ rcu_read_unlock();
+ err = netlink_dump_start(nlsk, skb, nlh, c);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ return err;
+}
+
+struct nfnl_dump_hook_data {
+ char devname[IFNAMSIZ];
+ unsigned long headv;
+ u8 hook;
+};
+
+static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ const struct nf_hook_ops *ops)
+{
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest, *nest2;
+ struct nft_chain *chain;
+ int ret = 0;
+
+ if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES)
+ return 0;
+
+ chain = ops->priv;
+ if (WARN_ON_ONCE(!chain))
+ return 0;
+
+ if (!nft_is_active(net, chain))
+ return 0;
+
+ nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE,
+ htonl(NFNL_HOOK_TYPE_NFTABLES));
+ if (ret)
+ goto cancel_nest;
+
+ nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+ if (!nest2)
+ goto cancel_nest;
+
+ ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name);
+ if (ret)
+ goto cancel_nest;
+
+ ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name);
+ if (ret)
+ goto cancel_nest;
+
+ ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family);
+ if (ret)
+ goto cancel_nest;
+
+ nla_nest_end(nlskb, nest2);
+ nla_nest_end(nlskb, nest);
+ return ret;
+
+cancel_nest:
+ nla_nest_cancel(nlskb, nest);
+ return -EMSGSIZE;
+}
+
+static int nfnl_hook_dump_one(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ const struct nf_hook_ops *ops,
+ int family, unsigned int seq)
+{
+ u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET);
+ unsigned int portid = NETLINK_CB(nlskb).portid;
+ struct nlmsghdr *nlh;
+ int ret = -EMSGSIZE;
+ u32 hooknum;
+#ifdef CONFIG_KALLSYMS
+ char sym[KSYM_SYMBOL_LEN];
+ char *module_name;
+#endif
+ nlh = nfnl_msg_put(nlskb, portid, seq, event,
+ NLM_F_MULTI, family, NFNETLINK_V0, 0);
+ if (!nlh)
+ goto nla_put_failure;
+
+#ifdef CONFIG_KALLSYMS
+ ret = snprintf(sym, sizeof(sym), "%ps", ops->hook);
+ if (ret >= sizeof(sym)) {
+ ret = -EINVAL;
+ goto nla_put_failure;
+ }
+
+ module_name = strstr(sym, " [");
+ if (module_name) {
+ char *end;
+
+ *module_name = '\0';
+ module_name += 2;
+ end = strchr(module_name, ']');
+ if (end) {
+ *end = 0;
+
+ ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name);
+ if (ret)
+ goto nla_put_failure;
+ }
+ }
+
+ ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym);
+ if (ret)
+ goto nla_put_failure;
+#endif
+
+ if (ops->pf == NFPROTO_INET && ops->hooknum == NF_INET_INGRESS)
+ hooknum = NF_NETDEV_INGRESS;
+ else
+ hooknum = ops->hooknum;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(hooknum));
+ if (ret)
+ goto nla_put_failure;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority));
+ if (ret)
+ goto nla_put_failure;
+
+ ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops);
+ if (ret)
+ goto nla_put_failure;
+
+ nlmsg_end(nlskb, nlh);
+ return 0;
+nla_put_failure:
+ nlmsg_trim(nlskb, nlh);
+ return ret;
+}
+
+static const struct nf_hook_entries *
+nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev)
+{
+ const struct nf_hook_entries *hook_head = NULL;
+#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
+ struct net_device *netdev;
+#endif
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
+ break;
+ case NFPROTO_IPV6:
+ if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
+ break;
+ case NFPROTO_ARP:
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
+ if (hook >= ARRAY_SIZE(net->nf.hooks_arp))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
+#endif
+ break;
+ case NFPROTO_BRIDGE:
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ if (hook >= ARRAY_SIZE(net->nf.hooks_bridge))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
+#endif
+ break;
+#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
+ case NFPROTO_NETDEV:
+ if (hook >= NF_NETDEV_NUMHOOKS)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ netdev = dev_get_by_name_rcu(net, dev);
+ if (!netdev)
+ return ERR_PTR(-ENODEV);
+
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (hook == NF_NETDEV_INGRESS)
+ return rcu_dereference(netdev->nf_hooks_ingress);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (hook == NF_NETDEV_EGRESS)
+ return rcu_dereference(netdev->nf_hooks_egress);
+#endif
+ fallthrough;
+#endif
+ default:
+ return ERR_PTR(-EPROTONOSUPPORT);
+ }
+
+ return hook_head;
+}
+
+static int nfnl_hook_dump(struct sk_buff *nlskb,
+ struct netlink_callback *cb)
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nfnl_dump_hook_data *ctx = cb->data;
+ int err, family = nfmsg->nfgen_family;
+ struct net *net = sock_net(nlskb->sk);
+ struct nf_hook_ops * const *ops;
+ const struct nf_hook_entries *e;
+ unsigned int i = cb->args[0];
+
+ rcu_read_lock();
+
+ e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname);
+ if (!e)
+ goto done;
+
+ if (IS_ERR(e)) {
+ cb->seq++;
+ goto done;
+ }
+
+ if ((unsigned long)e != ctx->headv || i >= e->num_hook_entries)
+ cb->seq++;
+
+ ops = nf_hook_entries_get_hook_ops(e);
+
+ for (; i < e->num_hook_entries; i++) {
+ err = nfnl_hook_dump_one(nlskb, ctx, ops[i], family,
+ cb->nlh->nlmsg_seq);
+ if (err)
+ break;
+ }
+
+done:
+ nl_dump_check_consistent(cb, nlmsg_hdr(nlskb));
+ rcu_read_unlock();
+ cb->args[0] = i;
+ return nlskb->len;
+}
+
+static int nfnl_hook_dump_start(struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nlattr * const *nla = cb->data;
+ struct nfnl_dump_hook_data *ctx = NULL;
+ struct net *net = sock_net(cb->skb->sk);
+ u8 family = nfmsg->nfgen_family;
+ char name[IFNAMSIZ] = "";
+ const void *head;
+ u32 hooknum;
+
+ hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM]));
+ if (hooknum > 255)
+ return -EINVAL;
+
+ if (family == NFPROTO_NETDEV) {
+ if (!nla[NFNLA_HOOK_DEV])
+ return -EINVAL;
+
+ nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name));
+ }
+
+ rcu_read_lock();
+ /* Not dereferenced; for consistency check only */
+ head = nfnl_hook_entries_head(family, hooknum, net, name);
+ rcu_read_unlock();
+
+ if (head && IS_ERR(head))
+ return PTR_ERR(head);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ strscpy(ctx->devname, name, sizeof(ctx->devname));
+ ctx->headv = (unsigned long)head;
+ ctx->hook = hooknum;
+
+ cb->seq = 1;
+ cb->data = ctx;
+
+ return 0;
+}
+
+static int nfnl_hook_dump_stop(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
+static int nfnl_hook_get(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ if (!nla[NFNLA_HOOK_HOOKNUM])
+ return -EINVAL;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nfnl_hook_dump_start,
+ .done = nfnl_hook_dump_stop,
+ .dump = nfnl_hook_dump,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = {
+ [NFNL_MSG_HOOK_GET] = {
+ .call = nfnl_hook_get,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFNLA_HOOK_MAX,
+ .policy = nfnl_hook_nla_policy
+ },
+};
+
+static const struct nfnetlink_subsystem nfhook_subsys = {
+ .name = "nfhook",
+ .subsys_id = NFNL_SUBSYS_HOOK,
+ .cb_count = NFNL_MSG_HOOK_MAX,
+ .cb = nfnl_hook_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK);
+
+static int __init nfnetlink_hook_init(void)
+{
+ return nfnetlink_subsys_register(&nfhook_subsys);
+}
+
+static void __exit nfnetlink_hook_exit(void)
+{
+ nfnetlink_subsys_unregister(&nfhook_subsys);
+}
+
+module_init(nfnetlink_hook_init);
+module_exit(nfnetlink_hook_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks");
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 0ba020ca38e6..d97eb280cb2e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -43,6 +43,10 @@
#include "../bridge/br_private.h"
#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
#define NFULNL_COPY_DISABLED 0xff
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
@@ -62,6 +66,7 @@ struct nfulnl_instance {
struct sk_buff *skb; /* pre-allocatd skb */
struct timer_list timer;
struct net *net;
+ netns_tracker ns_tracker;
struct user_namespace *peer_user_ns; /* User namespace of the peer process */
u32 peer_portid; /* PORTID of the peer process */
@@ -136,7 +141,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head)
struct nfulnl_instance *inst =
container_of(head, struct nfulnl_instance, rcu);
- put_net(inst->net);
+ put_net_track(inst->net, &inst->ns_tracker);
kfree(inst);
module_put(THIS_MODULE);
}
@@ -183,7 +188,7 @@ instance_create(struct net *net, u_int16_t group_num,
timer_setup(&inst->timer, nfulnl_timer, 0);
- inst->net = get_net(net);
+ inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC);
inst->peer_user_ns = user_ns;
inst->peer_portid = portid;
inst->group_num = group_num;
@@ -356,8 +361,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
goto out;
}
}
- nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
- MSG_DONTWAIT);
+ nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid);
out:
inst->qlen = 0;
inst->skb = NULL;
@@ -453,20 +457,16 @@ __build_packet_message(struct nfnl_log_net *log,
{
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
sk_buff_data_t old_tail = inst->skb->tail;
struct sock *sk;
const unsigned char *hwhdrp;
+ ktime_t tstamp;
- nlh = nlmsg_put(inst->skb, 0, 0,
- nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
- sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(inst->skb, 0, 0,
+ nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
+ 0, pf, NFNETLINK_V0, htons(inst->group_num));
if (!nlh)
return -1;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(inst->group_num);
memset(&pmsg, 0, sizeof(pmsg));
pmsg.hw_protocol = skb->protocol;
@@ -558,7 +558,8 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
if (indev && skb->dev &&
- skb->mac_header != skb->network_header) {
+ skb_mac_header_was_set(skb) &&
+ skb_mac_header_len(skb) != 0) {
struct nfulnl_msg_packet_hw phw;
int len;
@@ -588,9 +589,10 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (hooknum <= NF_INET_FORWARD && skb->tstamp) {
+ tstamp = skb_tstamp_cond(skb, false);
+ if (hooknum <= NF_INET_FORWARD && tstamp) {
struct nfulnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -689,7 +691,7 @@ nfulnl_log_packet(struct net *net,
struct nfnl_log_net *log = nfnl_log_pernet(net);
const struct nfnl_ct_hook *nfnl_ct = NULL;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info uninitialized_var(ctinfo);
+ enum ip_conntrack_info ctinfo;
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
@@ -734,14 +736,16 @@ nfulnl_log_packet(struct net *net,
size += nla_total_size(sizeof(u_int32_t));
if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
size += nla_total_size(sizeof(u_int32_t));
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL) {
- ct = nfnl_ct->get_ct(skb, &ctinfo);
+ ct = nf_ct_get(skb, &ctinfo);
if (ct != NULL)
size += nfnl_ct->build_size(ct);
}
}
+#endif
if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE)
size += nfulnl_get_bridge_size(skb);
@@ -845,10 +849,8 @@ static struct notifier_block nfulnl_rtnl_notifier = {
.notifier_call = nfulnl_rcv_nl_event,
};
-static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfula[])
{
return -ENOTSUPP;
}
@@ -869,29 +871,26 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
[NFULA_CFG_FLAGS] = { .type = NLA_U16 },
};
-static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfula[],
- struct netlink_ext_ack *extack)
+static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfula[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t group_num = ntohs(nfmsg->res_id);
- struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(info->net);
+ u_int16_t group_num = ntohs(info->nfmsg->res_id);
struct nfulnl_msg_config_cmd *cmd = NULL;
- struct nfnl_log_net *log = nfnl_log_pernet(net);
- int ret = 0;
+ struct nfulnl_instance *inst;
u16 flags = 0;
+ int ret = 0;
if (nfula[NFULA_CFG_CMD]) {
- u_int8_t pf = nfmsg->nfgen_family;
+ u_int8_t pf = info->nfmsg->nfgen_family;
cmd = nla_data(nfula[NFULA_CFG_CMD]);
/* Commands without queue context */
switch (cmd->command) {
case NFULNL_CFG_CMD_PF_BIND:
- return nf_log_bind_pf(net, pf, &nfulnl_logger);
+ return nf_log_bind_pf(info->net, pf, &nfulnl_logger);
case NFULNL_CFG_CMD_PF_UNBIND:
- nf_log_unbind_pf(net, pf);
+ nf_log_unbind_pf(info->net, pf);
return 0;
}
}
@@ -932,7 +931,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
goto out_put;
}
- inst = instance_create(net, group_num,
+ inst = instance_create(info->net, group_num,
NETLINK_CB(skb).portid,
sk_user_ns(NETLINK_CB(skb).sk));
if (IS_ERR(inst)) {
@@ -993,11 +992,17 @@ out:
}
static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
- [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
- .attr_count = NFULA_MAX, },
- [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
- .attr_count = NFULA_CFG_MAX,
- .policy = nfula_cfg_policy },
+ [NFULNL_MSG_PACKET] = {
+ .call = nfulnl_recv_unsupp,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFULA_MAX,
+ },
+ [NFULNL_MSG_CONFIG] = {
+ .call = nfulnl_recv_config,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFULA_CFG_MAX,
+ .policy = nfula_cfg_policy
+ },
};
static const struct nfnetlink_subsystem nfulnl_subsys = {
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 9f5dea0064ea..ee6840bd5933 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -165,12 +165,12 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
const struct sk_buff *skb,
const struct iphdr *ip,
- unsigned char *opts)
+ unsigned char *opts,
+ struct tcphdr *_tcph)
{
const struct tcphdr *tcp;
- struct tcphdr _tcph;
- tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph);
if (!tcp)
return NULL;
@@ -186,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
sizeof(struct tcphdr), ctx->optsize, opts);
+ if (!ctx->optp)
+ return NULL;
}
return tcp;
@@ -205,10 +207,11 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
int fmatch = FMATCH_WRONG;
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
+ struct tcphdr _tcph;
memset(&ctx, 0, sizeof(ctx));
- tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph);
if (!tcp)
return false;
@@ -265,10 +268,12 @@ bool nf_osf_find(const struct sk_buff *skb,
const struct nf_osf_finger *kf;
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+ bool found = false;
memset(&ctx, 0, sizeof(ctx));
- tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph);
if (!tcp)
return false;
@@ -279,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb,
data->genre = f->genre;
data->version = f->version;
+ found = true;
break;
}
- return true;
+ return found;
}
EXPORT_SYMBOL_GPL(nf_osf_find);
@@ -290,10 +296,9 @@ static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
[OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) },
};
-static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
+static int nfnl_osf_add_callback(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const osf_attrs[])
{
struct nf_osf_user_finger *f;
struct nf_osf_finger *kf = NULL, *sf;
@@ -305,7 +310,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
if (!osf_attrs[OSF_ATTR_FINGER])
return -EINVAL;
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -EINVAL;
f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
@@ -323,7 +328,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
kfree(kf);
kf = NULL;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
err = -EEXIST;
break;
}
@@ -337,11 +342,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
return err;
}
-static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
+static int nfnl_osf_remove_callback(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const osf_attrs[])
{
struct nf_osf_user_finger *f;
struct nf_osf_finger *sf;
@@ -375,11 +378,13 @@ static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
[OSF_MSG_ADD] = {
.call = nfnl_osf_add_callback,
+ .type = NFNL_CB_MUTEX,
.attr_count = OSF_ATTR_MAX,
.policy = nfnl_osf_policy,
},
[OSF_MSG_REMOVE] = {
.call = nfnl_osf_remove_callback,
+ .type = NFNL_CB_MUTEX,
.attr_count = OSF_ATTR_MAX,
.policy = nfnl_osf_policy,
},
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 76535fd9278c..87a9009d5234 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -225,7 +225,7 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
int err;
if (verdict == NF_ACCEPT ||
@@ -383,16 +383,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct sk_buff *entskb = entry->skb;
struct net_device *indev;
struct net_device *outdev;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info uninitialized_var(ctinfo);
- struct nfnl_ct_hook *nfnl_ct;
+ enum ip_conntrack_info ctinfo = 0;
+ const struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
char *secdata = NULL;
u32 seclen = 0;
+ ktime_t tstamp;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -403,11 +403,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* priority */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
- if (entskb->tstamp)
+ tstamp = skb_tstamp_cond(entskb, false);
+ if (tstamp)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
size += nfqnl_get_bridge_size(entry);
@@ -444,13 +446,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfnl_ct = rcu_dereference(nfnl_ct_hook);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (queue->flags & NFQA_CFG_F_CONNTRACK) {
if (nfnl_ct != NULL) {
- ct = nfnl_ct->get_ct(entskb, &ctinfo);
+ ct = nf_ct_get(entskb, &ctinfo);
if (ct != NULL)
size += nfnl_ct->build_size(ct);
}
}
+#endif
if (queue->flags & NFQA_CFG_F_UID_GID) {
size += (nla_total_size(sizeof(u_int32_t)) /* uid */
@@ -469,18 +473,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
goto nlmsg_failure;
}
- nlh = nlmsg_put(skb, 0, 0,
- nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
- sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(skb, 0, 0,
+ nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
+ 0, entry->state.pf, NFNETLINK_V0,
+ htons(queue->queue_num));
if (!nlh) {
skb_tx_error(entskb);
kfree_skb(skb);
goto nlmsg_failure;
}
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = entry->state.pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(queue->queue_num);
nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
pmsg = nla_data(nla);
@@ -561,8 +562,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
goto nla_put_failure;
+ if (entskb->priority &&
+ nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
+ goto nla_put_failure;
+
if (indev && entskb->dev &&
- entskb->mac_header != entskb->network_header) {
+ skb_mac_header_was_set(entskb) &&
+ skb_mac_header_len(entskb) != 0) {
struct nfqnl_msg_packet_hw phw;
int len;
@@ -578,9 +584,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) {
+ if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -681,7 +687,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
*packet_id_ptr = htonl(entry->id);
/* nfnetlink_unicast will either free the nskb or add it to a socket */
- err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid);
if (err < 0) {
if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
failopen = 1;
@@ -711,9 +717,15 @@ static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
- if (entry)
- nf_queue_entry_get_refs(entry);
- return entry;
+
+ if (!entry)
+ return NULL;
+
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+
+ kfree(entry);
+ return NULL;
}
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -737,12 +749,6 @@ static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif
-static void free_entry(struct nf_queue_entry *entry)
-{
- nf_queue_entry_release_refs(entry);
- kfree(entry);
-}
-
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
struct sk_buff *skb, struct nf_queue_entry *entry)
@@ -768,7 +774,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
entry_seg->skb = skb;
ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
if (ret)
- free_entry(entry_seg);
+ nf_queue_entry_free(entry_seg);
}
return ret;
}
@@ -827,7 +833,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
if (queued) {
if (err) /* some segments are already queued */
- free_entry(entry);
+ nf_queue_entry_free(entry);
kfree_skb(skb);
return 0;
}
@@ -837,11 +843,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
}
static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
struct sk_buff *nskb;
if (diff < 0) {
+ unsigned int min_len = skb_transport_offset(e->skb);
+
+ if (data_len < min_len)
+ return -EINVAL;
+
if (pskb_trim(e->skb, data_len))
return -ENOMEM;
} else if (diff > 0) {
@@ -959,6 +970,16 @@ static void nfqnl_nf_hook_drop(struct net *net)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int i;
+ /* This function is also called on net namespace error unwind,
+ * when pernet_ops->init() failed and the ->exit() functions of the
+ * previous pernet_ops get called.
+ *
+ * This may result in a call to nfqnl_nf_hook_drop() before
+ * struct nfnl_queue_net was allocated.
+ */
+ if (!q)
+ return;
+
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct nfqnl_instance *inst;
struct hlist_head *head = &q->instance_table[i];
@@ -1011,11 +1032,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
[NFQA_VLAN] = { .type = NLA_NESTED },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static struct nfqnl_instance *
@@ -1054,20 +1077,17 @@ static int nfq_id_after(unsigned int id, unsigned int max)
return (int)(id - max) > 0;
}
-static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u16 queue_num = ntohs(info->nfmsg->res_id);
struct nf_queue_entry *entry, *tmp;
- unsigned int verdict, maxid;
struct nfqnl_msg_verdict_hdr *vhdr;
struct nfqnl_instance *queue;
+ unsigned int verdict, maxid;
LIST_HEAD(batch_list);
- u16 queue_num = ntohs(nfmsg->res_id);
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
queue = verdict_instance_lookup(q, queue_num,
NETLINK_CB(skb).portid);
@@ -1099,20 +1119,24 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
}
return 0;
}
-static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
+static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
const struct nlmsghdr *nlh,
const struct nlattr * const nfqa[],
struct nf_queue_entry *entry,
enum ip_conntrack_info *ctinfo)
{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct nf_conn *ct;
- ct = nfnl_ct->get_ct(entry->skb, ctinfo);
+ ct = nf_ct_get(entry->skb, ctinfo);
if (ct == NULL)
return NULL;
@@ -1124,6 +1148,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
NETLINK_CB(entry->skb).portid,
nlmsg_report(nlh));
return ct;
+#else
+ return NULL;
+#endif
}
static int nfqa_parse_bridge(struct nf_queue_entry *entry,
@@ -1162,22 +1189,18 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
return 0;
}
-static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
+ const struct nfnl_ct_hook *nfnl_ct;
struct nfqnl_msg_verdict_hdr *vhdr;
+ enum ip_conntrack_info ctinfo;
struct nfqnl_instance *queue;
- unsigned int verdict;
struct nf_queue_entry *entry;
- enum ip_conntrack_info uninitialized_var(ctinfo);
- struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ unsigned int verdict;
int err;
queue = verdict_instance_lookup(q, queue_num,
@@ -1200,7 +1223,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_CT]) {
if (nfnl_ct != NULL)
- ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
+ ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry,
+ &ctinfo);
}
if (entry->state.pf == PF_BRIDGE) {
@@ -1224,14 +1248,15 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
return 0;
}
-static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
return -ENOTSUPP;
}
@@ -1249,16 +1274,13 @@ static const struct nf_queue_handler nfqh = {
.nf_hook_drop = nfqnl_nf_hook_drop,
};
-static int nfqnl_recv_config(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
- struct nfqnl_instance *queue;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
struct nfqnl_msg_config_cmd *cmd = NULL;
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ struct nfqnl_instance *queue;
__u32 flags = 0, mask = 0;
int ret = 0;
@@ -1377,17 +1399,29 @@ err_out_unlock:
}
static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
- [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX, },
- [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_policy },
- [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX,
- .policy = nfqa_cfg_policy },
- [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_batch_policy },
+ [NFQNL_MSG_PACKET] = {
+ .call = nfqnl_recv_unsupp,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ },
+ [NFQNL_MSG_VERDICT] = {
+ .call = nfqnl_recv_verdict,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_policy
+ },
+ [NFQNL_MSG_CONFIG] = {
+ .call = nfqnl_recv_config,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFQA_CFG_MAX,
+ .policy = nfqa_cfg_policy
+ },
+ [NFQNL_MSG_VERDICT_BATCH] = {
+ .call = nfqnl_recv_verdict_batch,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_batch_policy
+ },
};
static const struct nfnetlink_subsystem nfqnl_subsys = {
@@ -1505,7 +1539,6 @@ static int __net_init nfnl_queue_net_init(struct net *net)
&nfqnl_seq_ops, sizeof(struct iter_state)))
return -ENOMEM;
#endif
- nf_register_queue_handler(net, &nfqh);
return 0;
}
@@ -1514,7 +1547,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
unsigned int i;
- nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
@@ -1522,15 +1554,9 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
WARN_ON_ONCE(!hlist_empty(&q->instance_table[i]));
}
-static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
-{
- synchronize_rcu();
-}
-
static struct pernet_operations nfnl_queue_net_ops = {
.init = nfnl_queue_net_init,
.exit = nfnl_queue_net_exit,
- .exit_batch = nfnl_queue_net_exit_batch,
.id = &nfnl_queue_net_id,
.size = sizeof(struct nfnl_queue_net),
};
@@ -1558,6 +1584,8 @@ static int __init nfnetlink_queue_init(void)
goto cleanup_netlink_subsys;
}
+ nf_register_queue_handler(&nfqh);
+
return status;
cleanup_netlink_subsys:
@@ -1571,6 +1599,7 @@ out:
static void __exit nfnetlink_queue_fini(void)
{
+ nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 0ed2281f03be..e6e402b247d0 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -16,8 +16,8 @@
#include <net/netfilter/nf_tables_offload.h>
struct nft_bitwise {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
enum nft_bitwise_ops op:8;
u8 len;
struct nft_data mask;
@@ -30,7 +30,7 @@ static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
{
unsigned int i;
- for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+ for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++)
dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
}
@@ -93,7 +93,16 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
static int nft_bitwise_init_bool(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc d1, d2;
+ struct nft_data_desc mask = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->mask),
+ .len = priv->len,
+ };
+ struct nft_data_desc xor = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->xor),
+ .len = priv->len,
+ };
int err;
if (tb[NFTA_BITWISE_DATA])
@@ -103,36 +112,30 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv,
!tb[NFTA_BITWISE_XOR])
return -EINVAL;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
- tb[NFTA_BITWISE_MASK]);
+ err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) {
- err = -EINVAL;
- goto err1;
- }
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
- tb[NFTA_BITWISE_XOR]);
+ err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]);
if (err < 0)
- goto err1;
- if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) {
- err = -EINVAL;
- goto err2;
- }
+ goto err_xor_err;
return 0;
-err2:
- nft_data_release(&priv->xor, d2.type);
-err1:
- nft_data_release(&priv->mask, d1.type);
+
+err_xor_err:
+ nft_data_release(&priv->mask, mask.type);
+
return err;
}
static int nft_bitwise_init_shift(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc d;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = sizeof(u32),
+ };
int err;
if (tb[NFTA_BITWISE_MASK] ||
@@ -142,13 +145,12 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
if (!tb[NFTA_BITWISE_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d,
- tb[NFTA_BITWISE_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]);
if (err < 0)
return err;
- if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) ||
- priv->data.data[0] >= BITS_PER_TYPE(u32)) {
- nft_data_release(&priv->data, d.type);
+
+ if (priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+ nft_data_release(&priv->data, desc.type);
return -EINVAL;
}
@@ -163,25 +165,20 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
u32 len;
int err;
- if (!tb[NFTA_BITWISE_SREG] ||
- !tb[NFTA_BITWISE_DREG] ||
- !tb[NFTA_BITWISE_LEN])
- return -EINVAL;
-
err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
if (err < 0)
return err;
priv->len = len;
- priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
- err = nft_validate_register_load(priv->sreg, priv->len);
+ err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg,
+ priv->len);
if (err < 0)
return err;
- priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
if (err < 0)
return err;
@@ -283,19 +280,254 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
return 0;
}
+/*
+ * Register-tracking ("reduce") callback for the generic bitwise expression.
+ *
+ * Returns true, after pointing track->cur at @expr, when the source and
+ * destination registers track the same selector and the bitwise expression
+ * tracked for the destination compares equal to @expr field by field.
+ * Otherwise returns false after either cancelling the tracking state of the
+ * destination registers or recording @expr as their bitwise operation.
+ */
+static bool nft_bitwise_reduce(struct nft_regs_track *track,
+			       const struct nft_expr *expr)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	const struct nft_bitwise *bitwise;
+	unsigned int regcount;
+	unsigned int i;
+	u8 dreg;
+
+	/* Nothing is tracked for the source register: nothing to compare. */
+	if (!track->regs[priv->sreg].selector)
+		return false;
+
+	/*
+	 * NOTE(review): this reads the private area of the tracked *selector*
+	 * as a struct nft_bitwise, while the ops check below is made against
+	 * the tracked .bitwise expression -- confirm which one is intended.
+	 */
+	bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+	if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+	    track->regs[priv->sreg].num_reg == 0 &&
+	    track->regs[priv->dreg].bitwise &&
+	    track->regs[priv->dreg].bitwise->ops == expr->ops &&
+	    priv->sreg == bitwise->sreg &&
+	    priv->dreg == bitwise->dreg &&
+	    priv->op == bitwise->op &&
+	    priv->len == bitwise->len &&
+	    !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) &&
+	    !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) &&
+	    !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) {
+		track->cur = expr;
+		return true;
+	}
+
+	/*
+	 * The source already has a bitwise operation tracked, or is not the
+	 * first register of its selector: drop what is known about the
+	 * destination instead of recording a combined state.
+	 */
+	if (track->regs[priv->sreg].bitwise ||
+	    track->regs[priv->sreg].num_reg != 0) {
+		nft_reg_track_cancel(track, priv->dreg, priv->len);
+		return false;
+	}
+
+	if (priv->sreg != priv->dreg) {
+		nft_reg_track_update(track, track->regs[priv->sreg].selector,
+				     priv->dreg, priv->len);
+	}
+
+	/*
+	 * Record @expr on every 32-bit destination register covered by
+	 * priv->len. The index must advance with the loop; indexing with
+	 * priv->dreg would tag only the first register over and over.
+	 */
+	dreg = priv->dreg;
+	regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE);
+	for (i = 0; i < regcount; i++, dreg++)
+		track->regs[dreg].bitwise = expr;
+
+	return false;
+}
+
static const struct nft_expr_ops nft_bitwise_ops = {
.type = &nft_bitwise_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
.eval = nft_bitwise_eval,
.init = nft_bitwise_init,
.dump = nft_bitwise_dump,
+ .reduce = nft_bitwise_reduce,
.offload = nft_bitwise_offload,
};
+/*
+ * Parse attribute @tb through nft_data_init(), using a descriptor that asks
+ * for an NFT_DATA_VALUE of sizeof(u32) bytes, and store the first data word
+ * in @out. Returns 0 on success or the negative error from nft_data_init(),
+ * in which case @out is left untouched.
+ */
+static int
+nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out)
+{
+	struct nft_data value;
+	struct nft_data_desc desc = {
+		.len	= sizeof(u32),
+		.size	= sizeof(value),
+		.type	= NFT_DATA_VALUE,
+	};
+	int ret;
+
+	ret = nft_data_init(NULL, &value, &desc, tb);
+	if (ret < 0)
+		return ret;
+
+	*out = value.data[0];
+
+	return 0;
+}
+
+/*
+ * ->init for the fast bitwise variant.
+ *
+ * Parses the source and destination registers for a sizeof(u32) operand,
+ * rejects NFTA_BITWISE_DATA, requires both NFTA_BITWISE_MASK and
+ * NFTA_BITWISE_XOR, and stores the two 32-bit values in the private area.
+ * Returns 0 on success, -EINVAL on a bad attribute combination, or the
+ * negative error from the register/data parsing helpers.
+ */
+static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nlattr * const tb[])
+{
+	struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+	int ret;
+
+	ret = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg,
+				      sizeof(u32));
+	if (ret < 0)
+		return ret;
+
+	ret = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], &priv->dreg,
+				       NULL, NFT_DATA_VALUE, sizeof(u32));
+	if (ret < 0)
+		return ret;
+
+	/* Mask and xor are mandatory here; a DATA attribute is refused. */
+	if (tb[NFTA_BITWISE_DATA] ||
+	    !tb[NFTA_BITWISE_MASK] ||
+	    !tb[NFTA_BITWISE_XOR])
+		return -EINVAL;
+
+	ret = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_MASK], &priv->mask);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_XOR], &priv->xor);
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * ->dump for the fast bitwise variant: emit SREG, DREG, a LEN of sizeof(u32),
+ * an OP of NFT_BITWISE_BOOL, then the mask and xor words, in that order.
+ * Returns 0 on success and -1 as soon as one attribute cannot be added.
+ */
+static int
+nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+	struct nft_data word;
+
+	if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg) ||
+	    nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg) ||
+	    nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32))) ||
+	    nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL)))
+		goto nla_put_failure;
+
+	/* Only the first word of the scratch nft_data is filled and dumped. */
+	word.data[0] = priv->mask;
+	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &word,
+			  NFT_DATA_VALUE, sizeof(u32)) < 0)
+		goto nla_put_failure;
+
+	word.data[0] = priv->xor;
+	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &word,
+			  NFT_DATA_VALUE, sizeof(u32)) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/*
+ * ->offload for the fast bitwise variant.
+ *
+ * Returns -EOPNOTSUPP when the xor value is non-zero, when source and
+ * destination registers differ, or when the offload register length is not
+ * sizeof(u32). Otherwise copies the mask into the first word of the offload
+ * register's mask and returns 0.
+ */
+static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx,
+				    struct nft_flow_rule *flow,
+				    const struct nft_expr *expr)
+{
+	const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	if (priv->xor)
+		return -EOPNOTSUPP;
+
+	if (priv->sreg != priv->dreg)
+		return -EOPNOTSUPP;
+
+	if (reg->len != sizeof(u32))
+		return -EOPNOTSUPP;
+
+	reg->mask.data[0] = priv->mask;
+
+	return 0;
+}
+
+/*
+ * Register-tracking ("reduce") callback for the fast bitwise variant.
+ *
+ * Returns true, after pointing track->cur at @expr, when the source and
+ * destination registers track the same selector and the expression tracked
+ * for the destination has the same registers, mask and xor as @expr.
+ * Otherwise returns false after either cancelling the destination's tracking
+ * state or recording @expr as its bitwise operation.
+ */
+static bool nft_bitwise_fast_reduce(struct nft_regs_track *track,
+				    const struct nft_expr *expr)
+{
+	const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+	const struct nft_bitwise_fast_expr *tracked;
+	const unsigned int sreg = priv->sreg;
+	const unsigned int dreg = priv->dreg;
+
+	if (!track->regs[sreg].selector)
+		return false;
+
+	/*
+	 * NOTE(review): the private area of the tracked *selector* is read as
+	 * a struct nft_bitwise_fast_expr here, while the ops check below is
+	 * made against the tracked .bitwise expression -- confirm intent.
+	 */
+	tracked = nft_expr_priv(track->regs[dreg].selector);
+	if (track->regs[sreg].selector == track->regs[dreg].selector &&
+	    track->regs[dreg].bitwise &&
+	    track->regs[dreg].bitwise->ops == expr->ops &&
+	    sreg == tracked->sreg &&
+	    dreg == tracked->dreg &&
+	    priv->mask == tracked->mask &&
+	    priv->xor == tracked->xor) {
+		track->cur = expr;
+		return true;
+	}
+
+	/* Source already carries a bitwise operation: forget the destination. */
+	if (track->regs[sreg].bitwise) {
+		nft_reg_track_cancel(track, dreg, NFT_REG32_SIZE);
+		return false;
+	}
+
+	if (sreg != dreg)
+		track->regs[dreg].selector = track->regs[sreg].selector;
+
+	track->regs[dreg].bitwise = expr;
+
+	return false;
+}
+
+/*
+ * Ops for the fast bitwise variant, returned by nft_bitwise_select_ops().
+ * No ->eval is provided here.
+ */
+const struct nft_expr_ops nft_bitwise_fast_ops = {
+	.type		= &nft_bitwise_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)),
+	.init		= nft_bitwise_fast_init,
+	.eval		= NULL, /* inlined */
+	.dump		= nft_bitwise_fast_dump,
+	.offload	= nft_bitwise_fast_offload,
+	.reduce		= nft_bitwise_fast_reduce,
+};
+
+/*
+ * Choose the ops for a new bitwise expression.
+ *
+ * Returns ERR_PTR(-EINVAL) when LEN, SREG or DREG is missing, or the
+ * ERR_PTR-wrapped error from parsing LEN. Returns &nft_bitwise_fast_ops when
+ * the length is sizeof(u32) and the operation is absent or NFT_BITWISE_BOOL,
+ * and &nft_bitwise_ops in every other case.
+ */
+static const struct nft_expr_ops *
+nft_bitwise_select_ops(const struct nft_ctx *ctx,
+		       const struct nlattr * const tb[])
+{
+	u32 len;
+	int ret;
+
+	if (!tb[NFTA_BITWISE_LEN] ||
+	    !tb[NFTA_BITWISE_SREG] ||
+	    !tb[NFTA_BITWISE_DREG])
+		return ERR_PTR(-EINVAL);
+
+	ret = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (len == sizeof(u32) &&
+	    (!tb[NFTA_BITWISE_OP] ||
+	     ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) == NFT_BITWISE_BOOL))
+		return &nft_bitwise_fast_ops;
+
+	return &nft_bitwise_ops;
+}
+
struct nft_expr_type nft_bitwise_type __read_mostly = {
.name = "bitwise",
- .ops = &nft_bitwise_ops,
+ .select_ops = nft_bitwise_select_ops,
.policy = nft_bitwise_policy,
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
};
+
+/* Run the bitwise reduce callback on the expression that follows @expr.
+ *
+ * Returns false when @expr is the last tracked expression or when the next
+ * expression is neither a generic nor a fast bitwise expression; otherwise
+ * returns whatever the matching reduce callback reports for that next
+ * expression.
+ */
+bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
+			     const struct nft_expr *expr)
+{
+	const struct nft_expr *following;
+
+	if (expr == track->last)
+		return false;
+
+	following = nft_expr_next(expr);
+
+	if (following->ops == &nft_bitwise_ops)
+		return nft_bitwise_reduce(track, following);
+
+	if (following->ops == &nft_bitwise_fast_ops)
+		return nft_bitwise_fast_reduce(track, following);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(nft_expr_reduce_bitwise);
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 12bed3f7bbc6..f952a80275a8 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -16,8 +16,8 @@
#include <net/netfilter/nf_tables.h>
struct nft_byteorder {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
enum nft_byteorder_ops op:8;
u8 len;
u8 size;
@@ -44,7 +44,8 @@ void nft_byteorder_eval(const struct nft_expr *expr,
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = nft_reg_load64(&src[i]);
- nft_reg_store64(&dst[i], be64_to_cpu(src64));
+ nft_reg_store64(&dst[i],
+ be64_to_cpu((__force __be64)src64));
}
break;
case NFT_BYTEORDER_HTON:
@@ -131,20 +132,20 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
if (err < 0)
return err;
priv->len = len;
- err = nft_validate_register_load(priv->sreg, priv->len);
+ err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg,
+ priv->len);
if (err < 0)
return err;
- priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -167,12 +168,23 @@ nla_put_failure:
return -1;
}
+/* Reduce callback for byteorder: calls nft_reg_track_cancel() on the
+ * destination register for the expression's length and always returns
+ * false.
+ */
+static bool nft_byteorder_reduce(struct nft_regs_track *track,
+				 const struct nft_expr *expr)
+{
+	const struct nft_byteorder *bo = nft_expr_priv(expr);
+
+	nft_reg_track_cancel(track, bo->dreg, bo->len);
+	return false;
+}
+
static const struct nft_expr_ops nft_byteorder_ops = {
.type = &nft_byteorder_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
.eval = nft_byteorder_eval,
.init = nft_byteorder_init,
.dump = nft_byteorder_dump,
+ .reduce = nft_byteorder_reduce,
};
struct nft_expr_type nft_byteorder_type __read_mostly = {
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index c78d01bc02e9..c3563f0be269 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
switch (state->pf) {
case NFPROTO_IPV4:
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
break;
case NFPROTO_IPV6:
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
break;
default:
break;
@@ -161,16 +161,49 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
return nft_do_chain(&pkt, priv);
}
+/* Hook function used for the NF_INET_INGRESS slot of the inet filter chain.
+ *
+ * Works on a private copy of the hook state whose pf/hook are rewritten to
+ * IPv4 or IPv6 plus NF_INET_INGRESS according to skb->protocol.  Packets of
+ * any other protocol are accepted without running the chain; packets for
+ * which the ingress pktinfo setup reports an error are dropped.
+ */
+static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb,
+					      const struct nf_hook_state *state)
+{
+	struct nf_hook_state ingress_state = *state;
+	struct nft_pktinfo pkt;
+	int err;
+
+	/* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ingress_state.pf = NFPROTO_IPV4;
+		ingress_state.hook = NF_INET_INGRESS;
+		nft_set_pktinfo(&pkt, skb, &ingress_state);
+		err = nft_set_pktinfo_ipv4_ingress(&pkt);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ingress_state.pf = NFPROTO_IPV6;
+		ingress_state.hook = NF_INET_INGRESS;
+		nft_set_pktinfo(&pkt, skb, &ingress_state);
+		err = nft_set_pktinfo_ipv6_ingress(&pkt);
+	} else {
+		return NF_ACCEPT;
+	}
+
+	if (err < 0)
+		return NF_DROP;
+
+	return nft_do_chain(&pkt, priv);
+}
+
static const struct nft_chain_type nft_chain_filter_inet = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_INET,
- .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ .hook_mask = (1 << NF_INET_INGRESS) |
+ (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_FORWARD) |
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING),
.hooks = {
+ [NF_INET_INGRESS] = nft_do_chain_inet_ingress,
[NF_INET_LOCAL_IN] = nft_do_chain_inet,
[NF_INET_LOCAL_OUT] = nft_do_chain_inet,
[NF_INET_FORWARD] = nft_do_chain_inet,
@@ -205,13 +238,13 @@ nft_do_chain_bridge(void *priv,
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
- nft_set_pktinfo_ipv4_validate(&pkt, skb);
+ nft_set_pktinfo_ipv4_validate(&pkt);
break;
case htons(ETH_P_IPV6):
- nft_set_pktinfo_ipv6_validate(&pkt, skb);
+ nft_set_pktinfo_ipv6_validate(&pkt);
break;
default:
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
break;
}
@@ -260,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb,
switch (skb->protocol) {
case htons(ETH_P_IP):
- nft_set_pktinfo_ipv4_validate(&pkt, skb);
+ nft_set_pktinfo_ipv4_validate(&pkt);
break;
case htons(ETH_P_IPV6):
- nft_set_pktinfo_ipv6_validate(&pkt, skb);
+ nft_set_pktinfo_ipv6_validate(&pkt);
break;
default:
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
break;
}
@@ -277,9 +310,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_NETDEV,
- .hook_mask = (1 << NF_NETDEV_INGRESS),
+ .hook_mask = (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS),
.hooks = {
[NF_NETDEV_INGRESS] = nft_do_chain_netdev,
+ [NF_NETDEV_EGRESS] = nft_do_chain_netdev,
},
};
@@ -309,12 +344,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
return;
}
- /* UNREGISTER events are also happening on netns exit.
- *
- * Although nf_tables core releases all tables/chains, only this event
- * handler provides guarantee that hook->ops.dev is still accessible,
- * so we cannot skip exiting net namespaces.
- */
__nft_release_basechain(ctx);
}
@@ -322,6 +351,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nftables_pernet *nft_net;
struct nft_table *table;
struct nft_chain *chain, *nr;
struct nft_ctx ctx = {
@@ -332,8 +362,12 @@ static int nf_tables_netdev_event(struct notifier_block *this,
event != NETDEV_CHANGENAME)
return NOTIFY_DONE;
- mutex_lock(&ctx.net->nft.commit_mutex);
- list_for_each_entry(table, &ctx.net->nft.tables, list) {
+ if (!check_net(ctx.net))
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(ctx.net);
+ mutex_lock(&nft_net->commit_mutex);
+ list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV)
continue;
@@ -347,7 +381,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
nft_netdev_event(event, dev, &ctx);
}
}
- mutex_unlock(&ctx.net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return NOTIFY_DONE;
}
diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c
index eac4a901233f..98e4946100c5 100644
--- a/net/netfilter/nft_chain_nat.c
+++ b/net/netfilter/nft_chain_nat.c
@@ -17,12 +17,12 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb,
switch (state->pf) {
#ifdef CONFIG_NF_TABLES_IPV4
case NFPROTO_IPV4:
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
break;
#endif
#ifdef CONFIG_NF_TABLES_IPV6
case NFPROTO_IPV6:
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
break;
#endif
default:
diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c
index 8826bbe71136..925db0dce48d 100644
--- a/net/netfilter/nft_chain_route.c
+++ b/net/netfilter/nft_chain_route.c
@@ -26,7 +26,7 @@ static unsigned int nf_route_table_hook4(void *priv,
u8 tos;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
mark = skb->mark;
iph = ip_hdr(skb);
@@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -74,7 +74,7 @@ static unsigned int nf_route_table_hook6(void *priv,
int err;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
/* save source/dest address, mark, hoplimit, flowlabel, priority */
memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
@@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv,
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u32 *)ipv6_hdr(skb)))) {
- err = nf_ip6_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 8a28c127effc..963cf831799c 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -18,7 +18,7 @@
struct nft_cmp_expr {
struct nft_data data;
- enum nft_registers sreg:8;
+ u8 sreg;
u8 len;
enum nft_cmp_ops op:8;
};
@@ -43,7 +43,7 @@ void nft_cmp_eval(const struct nft_expr *expr,
case NFT_CMP_LT:
if (d == 0)
goto mismatch;
- /* fall through */
+ fallthrough;
case NFT_CMP_LTE:
if (d > 0)
goto mismatch;
@@ -51,7 +51,7 @@ void nft_cmp_eval(const struct nft_expr *expr,
case NFT_CMP_GT:
if (d == 0)
goto mismatch;
- /* fall through */
+ fallthrough;
case NFT_CMP_GTE:
if (d < 0)
goto mismatch;
@@ -73,22 +73,17 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
int err;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- if (desc.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- nft_data_release(&priv->data, desc.type);
- return err;
- }
-
- priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
- err = nft_validate_register_load(priv->sreg, desc.len);
+ err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
@@ -115,19 +110,56 @@ nla_put_failure:
return -1;
}
+/* Scratch storage for a 2-, 4- or 8-byte value converted from network to
+ * host byte order by nft_payload_n2h(); one member per supported width.
+ */
+union nft_cmp_offload_data {
+ u16 val16;
+ u32 val32;
+ u64 val64;
+};
+
+/* Convert the big-endian value at @val to host byte order and store it in
+ * the member of @data that matches @len (2, 4 or 8 bytes).  Any other
+ * length triggers a one-time warning and leaves @data untouched.
+ */
+static void nft_payload_n2h(union nft_cmp_offload_data *data,
+			    const u8 *val, u32 len)
+{
+	if (len == 2)
+		data->val16 = ntohs(*((__be16 *)val));
+	else if (len == 4)
+		data->val32 = ntohl(*((__be32 *)val));
+	else if (len == 8)
+		data->val64 = be64_to_cpu(*((__be64 *)val));
+	else
+		WARN_ON_ONCE(1);
+}
+
static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
struct nft_flow_rule *flow,
const struct nft_cmp_expr *priv)
{
struct nft_offload_reg *reg = &ctx->regs[priv->sreg];
+ union nft_cmp_offload_data _data, _datamask;
u8 *mask = (u8 *)&flow->match.mask;
u8 *key = (u8 *)&flow->match.key;
+ u8 *data, *datamask;
- if (priv->op != NFT_CMP_EQ || reg->len != priv->len)
+ if (priv->op != NFT_CMP_EQ || priv->len > reg->len)
return -EOPNOTSUPP;
- memcpy(key + reg->offset, &priv->data, priv->len);
- memcpy(mask + reg->offset, &reg->mask, priv->len);
+ if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) {
+ nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len);
+ nft_payload_n2h(&_datamask, (u8 *)&reg->mask, reg->len);
+ data = (u8 *)&_data;
+ datamask = (u8 *)&_datamask;
+ } else {
+ data = (u8 *)&priv->data;
+ datamask = (u8 *)&reg->mask;
+ }
+
+ memcpy(key + reg->offset, data, reg->len);
+ memcpy(mask + reg->offset, datamask, reg->len);
flow->match.dissector.used_keys |= BIT(reg->key);
flow->match.dissector.offset[reg->key] = reg->base_offset;
@@ -137,7 +169,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
return -EOPNOTSUPP;
- nft_offload_update_dependency(ctx, &priv->data, priv->len);
+ nft_offload_update_dependency(ctx, &priv->data, reg->len);
return 0;
}
@@ -157,34 +189,48 @@ static const struct nft_expr_ops nft_cmp_ops = {
.eval = nft_cmp_eval,
.init = nft_cmp_init,
.dump = nft_cmp_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_cmp_offload,
};
+/* Build the comparison mask used by the nft_cmp_fast expression for a value
+ * of @len bits.  When the data is interpreted as something smaller than a
+ * full u32, big endian needs the *upper* bytes covered by the mask, which
+ * is why the shifted value goes through cpu_to_le32().
+ */
+static u32 nft_cmp_fast_mask(unsigned int len)
+{
+	const size_t data_bits =
+		sizeof_field(struct nft_cmp_fast_expr, data) * BITS_PER_BYTE;
+	__le32 mask = cpu_to_le32(~0U >> (data_bits - len));
+
+	return (__force u32)mask;
+}
+
static int nft_cmp_fast_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
struct nft_data data;
- u32 mask;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
int err;
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
- err = nft_validate_register_load(priv->sreg, desc.len);
+ err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
desc.len *= BITS_PER_BYTE;
- mask = nft_cmp_fast_mask(desc.len);
- priv->data = data.data[0] & mask;
+ priv->mask = nft_cmp_fast_mask(desc.len);
+ priv->data = data.data[0] & priv->mask;
priv->len = desc.len;
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
return 0;
}
@@ -201,7 +247,7 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
},
.sreg = priv->sreg,
.len = priv->len / BITS_PER_BYTE,
- .op = NFT_CMP_EQ,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
};
return __nft_cmp_offload(ctx, flow, &cmp);
@@ -210,11 +256,12 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
struct nft_data data;
if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
goto nla_put_failure;
data.data[0] = priv->data;
@@ -233,15 +280,113 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.eval = NULL, /* inlined */
.init = nft_cmp_fast_init,
.dump = nft_cmp_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_cmp_fast_offload,
};
+/* Return a 32-bit mask for @bitlen bits: all-ones shifted right by the
+ * number of unused bits, then passed through cpu_to_le32().
+ */
+static u32 nft_cmp_mask(u32 bitlen)
+{
+	const size_t unused_bits = sizeof(u32) * BITS_PER_BYTE - bitlen;
+	__le32 mask = cpu_to_le32(~0U >> unused_bits);
+
+	return (__force u32)mask;
+}
+
+/* Fill the four 32-bit words of @data with a mask covering @bitlen bits:
+ * every fully covered word becomes 0xffffffff, a trailing partially covered
+ * word gets nft_cmp_mask() of the remaining bits, and the rest is zeroed.
+ */
+static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen)
+{
+	const unsigned int word_bits = sizeof(u32) * BITS_PER_BYTE;
+	int nbytes = bitlen / BITS_PER_BYTE;
+	int full_words = nbytes / sizeof(u32);
+	int idx = 0;
+
+	while (idx < full_words)
+		data->data[idx++] = 0xffffffff;
+
+	/* Bits left over once the full words have been accounted for. */
+	if (nbytes % sizeof(u32))
+		data->data[idx++] = nft_cmp_mask(bitlen - full_words * word_bits);
+
+	while (idx < 4)
+		data->data[idx++] = 0;
+}
+
+/* Initialise a cmp16 fast expression from its netlink attributes.
+ *
+ * Parses NFTA_CMP_DATA into the expression's data, validates and stores the
+ * source register for the parsed length, then records the length, the
+ * inversion flag (set for any operation other than NFT_CMP_EQ) and the
+ * comparison mask.  Returns 0 or the negative error of the failing parser.
+ */
+static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_cmp16_fast_expr *cmp16 = nft_expr_priv(expr);
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(cmp16->data),
+	};
+	int ret;
+
+	ret = nft_data_init(NULL, &cmp16->data, &desc, tb[NFTA_CMP_DATA]);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_parse_register_load(tb[NFTA_CMP_SREG], &cmp16->sreg,
+				      desc.len);
+	if (ret < 0)
+		return ret;
+
+	cmp16->len = desc.len;
+	cmp16->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
+	nft_cmp16_fast_mask(&cmp16->mask, desc.len * BITS_PER_BYTE);
+
+	return 0;
+}
+
+/* Offload callback for cmp16 fast: builds an equivalent generic
+ * struct nft_cmp_expr on the stack (same data, register and length, with
+ * the inversion flag mapped to NFT_CMP_NEQ / NFT_CMP_EQ) and hands it to
+ * __nft_cmp_offload().
+ */
+static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
+				  struct nft_flow_rule *flow,
+				  const struct nft_expr *expr)
+{
+	const struct nft_cmp16_fast_expr *cmp16 = nft_expr_priv(expr);
+	enum nft_cmp_ops op = cmp16->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+	struct nft_cmp_expr cmp = {
+		.data	= cmp16->data,
+		.sreg	= cmp16->sreg,
+		.len	= cmp16->len,
+		.op	= op,
+	};
+
+	return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+/* Dump a cmp16 fast expression to @skb: source register, operation
+ * (NFT_CMP_NEQ when inverted, NFT_CMP_EQ otherwise) and the comparison
+ * data, in that order.  Returns 0 on success, -1 as soon as one of the
+ * attributes cannot be added.
+ */
+static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_cmp16_fast_expr *cmp16 = nft_expr_priv(expr);
+	enum nft_cmp_ops op = cmp16->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+
+	if (nft_dump_register(skb, NFTA_CMP_SREG, cmp16->sreg) ||
+	    nla_put_be32(skb, NFTA_CMP_OP, htonl(op)) ||
+	    nft_data_dump(skb, NFTA_CMP_DATA, &cmp16->data,
+			  NFT_DATA_VALUE, cmp16->len) < 0)
+		return -1;
+
+	return 0;
+}
+
+
+/* Expression ops for the cmp16 fast variant.  nft_cmp_select_ops() returns
+ * them for EQ/NEQ comparisons longer than a u32 that still fit in
+ * struct nft_data, when the source register is NFT_REG_1..NFT_REG_4 or an
+ * even-numbered NFT_REG32_00..NFT_REG32_12.  There is no ->eval callback
+ * here; per the note on that field, evaluation is inlined.
+ */
+const struct nft_expr_ops nft_cmp16_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp16_fast_init,
+ .dump = nft_cmp16_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_cmp16_fast_offload,
+};
+
static const struct nft_expr_ops *
nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
- struct nft_data_desc desc;
struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
enum nft_cmp_ops op;
+ u8 sreg;
int err;
if (tb[NFTA_CMP_SREG] == NULL ||
@@ -262,23 +407,21 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return ERR_PTR(-EINVAL);
}
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return ERR_PTR(err);
- if (desc.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err1;
- }
-
- if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
- return &nft_cmp_fast_ops;
+ sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
+ if (desc.len <= sizeof(u32))
+ return &nft_cmp_fast_ops;
+ else if (desc.len <= sizeof(data) &&
+ ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) ||
+ (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0)))
+ return &nft_cmp16_fast_ops;
+ }
return &nft_cmp_ops;
-err1:
- nft_data_release(&data, desc.type);
- return ERR_PTR(-EINVAL);
}
struct nft_expr_type nft_cmp_type __read_mostly = {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index f9adca62ccb3..c16172427622 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -19,6 +19,7 @@
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_arp/arp_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
/* Used for matches where *info is larger than X byte */
#define NFT_MATCH_LARGE_THRESH 192
@@ -57,8 +58,13 @@ union nft_entry {
};
static inline void
-nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+nft_compat_set_par(struct xt_action_param *par,
+ const struct nft_pktinfo *pkt,
+ const void *xt, const void *xt_info)
{
+ par->state = pkt->state;
+ par->thoff = nft_thoff(pkt);
+ par->fragoff = pkt->fragoff;
par->target = xt;
par->targinfo = xt_info;
par->hotdrop = false;
@@ -71,13 +77,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr,
void *info = nft_expr_priv(expr);
struct xt_target *target = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
int ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+ nft_compat_set_par(&xt, pkt, target, info);
- ret = target->target(skb, &pkt->xt);
+ ret = target->target(skb, &xt);
- if (pkt->xt.hotdrop)
+ if (xt.hotdrop)
ret = NF_DROP;
switch (ret) {
@@ -97,13 +104,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
void *info = nft_expr_priv(expr);
struct xt_target *target = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
int ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+ nft_compat_set_par(&xt, pkt, target, info);
- ret = target->target(skb, &pkt->xt);
+ ret = target->target(skb, &xt);
- if (pkt->xt.hotdrop)
+ if (xt.hotdrop)
ret = NF_DROP;
switch (ret) {
@@ -213,6 +221,17 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
return 0;
}
+/* Wait for pending xtables ->destroy invocations to finish.
+ *
+ * xtables matches or targets can have side effects, e.g. creation or
+ * destruction of /proc files, and their ->destroy functions run
+ * asynchronously from a work queue.  Pending invocations therefore have to
+ * complete before a new match/target is checked.
+ */
+static void nft_compat_wait_for_destructors(void)
+{
+	nf_tables_trans_destroy_flush_work();
+}
+
static int
nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -236,9 +255,25 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+ nft_compat_wait_for_destructors();
+
ret = xt_check_target(&par, size, proto, inv);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ const char *modname = NULL;
+
+ if (strcmp(target->name, "LOG") == 0)
+ modname = "nf_log_syslog";
+ else if (strcmp(target->name, "NFLOG") == 0)
+ modname = "nfnetlink_log";
+
+ if (modname &&
+ nft_request_module(ctx->net, "%s", modname) == -EAGAIN)
+ return -EAGAIN;
+ }
+
return ret;
+ }
/* The standard target cannot be used */
if (!target->target)
@@ -247,6 +282,12 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}
+/* Common tail of match/target destruction: drop the reference held on
+ * module @me, then kfree() the ops structure attached to @expr.
+ */
+static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr)
+{
+ module_put(me);
+ kfree(expr->ops);
+}
+
static void
nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -262,8 +303,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
if (par.target->destroy != NULL)
par.target->destroy(&par);
- module_put(me);
- kfree(expr->ops);
+ __nft_mt_tg_destroy(me, expr);
}
static int nft_extension_dump_info(struct sk_buff *skb, int attr,
@@ -332,13 +372,14 @@ static void __nft_match_eval(const struct nft_expr *expr,
{
struct xt_match *match = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
bool ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+ nft_compat_set_par(&xt, pkt, match, info);
- ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+ ret = match->match(skb, &xt);
- if (pkt->xt.hotdrop) {
+ if (xt.hotdrop) {
regs->verdict.code = NF_DROP;
return;
}
@@ -451,6 +492,8 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+ nft_compat_wait_for_destructors();
+
return xt_check_match(&par, size, proto, inv);
}
@@ -494,8 +537,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (par.match->destroy != NULL)
par.match->destroy(&par);
- module_put(me);
- kfree(expr->ops);
+ __nft_mt_tg_destroy(me, expr);
}
static void
@@ -572,19 +614,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int rev, int target)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
@@ -599,17 +636,15 @@ nla_put_failure:
return -1;
}
-static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ u8 family = info->nfmsg->nfgen_family;
+ const char *name, *fmt;
+ struct sk_buff *skb2;
int ret = 0, target;
- struct nfgenmsg *nfmsg;
- const char *fmt;
- const char *name;
u32 rev;
- struct sk_buff *skb2;
if (tb[NFTA_COMPAT_NAME] == NULL ||
tb[NFTA_COMPAT_REV] == NULL ||
@@ -620,9 +655,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
- nfmsg = nlmsg_data(nlh);
-
- switch(nfmsg->nfgen_family) {
+ switch(family) {
case AF_INET:
fmt = "ipt_%s";
break;
@@ -636,8 +669,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
fmt = "arpt_%s";
break;
default:
- pr_err("nft_compat: unsupported protocol %d\n",
- nfmsg->nfgen_family);
+ pr_err("nft_compat: unsupported protocol %d\n", family);
return -EINVAL;
}
@@ -645,9 +677,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
return -EINVAL;
rcu_read_unlock();
- try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
- rev, target, &ret),
- fmt, name);
+ try_then_request_module(xt_find_revision(family, name, rev, target, &ret),
+ fmt, name);
if (ret < 0)
goto out_put;
@@ -659,23 +690,20 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
/* include the best revision for this extension in the message */
if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
NFNL_MSG_COMPAT_GET,
- nfmsg->nfgen_family,
- name, ret, target) <= 0) {
+ family, name, ret, target) <= 0) {
kfree_skb(skb2);
goto out_put;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
out_put:
rcu_read_lock();
module_put(THIS_MODULE);
- return ret == -EAGAIN ? -ENOBUFS : ret;
+
+ return ret;
}
static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
@@ -686,9 +714,12 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
};
static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
- [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu,
- .attr_count = NFTA_COMPAT_MAX,
- .policy = nfnl_compat_policy_get },
+ [NFNL_MSG_COMPAT_GET] = {
+ .call = nfnl_compat_get_rcu,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_COMPAT_MAX,
+ .policy = nfnl_compat_policy_get
+ },
};
static const struct nfnetlink_subsystem nfnl_compat_subsys = {
@@ -700,6 +731,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
static struct nft_expr_type nft_match_type;
+/* Reduce callback for xtables matches: returns true only when the match
+ * stored in the expression's ops data is named "comment".
+ */
+static bool nft_match_reduce(struct nft_regs_track *track,
+			     const struct nft_expr *expr)
+{
+	const struct xt_match *xt_m = expr->ops->data;
+
+	return !strcmp(xt_m->name, "comment");
+}
+
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -742,6 +781,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_match_dump;
ops->validate = nft_match_validate;
ops->data = match;
+ ops->reduce = nft_match_reduce;
matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
if (matchsize > NFT_MATCH_LARGE_THRESH) {
@@ -831,6 +871,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_target_dump;
ops->validate = nft_target_validate;
ops->data = target;
+ ops->reduce = NFT_REDUCE_READONLY;
if (family == NFPROTO_BRIDGE)
ops->eval = nft_target_eval_bridge;
@@ -902,3 +943,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("match");
MODULE_ALIAS_NFT_EXPR("target");
+MODULE_DESCRIPTION("x_tables over nftables support");
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 69d6173f91e2..d657f999a11b 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,7 +14,7 @@
#include <net/netfilter/nf_conntrack_zones.h>
struct nft_connlimit {
- struct nf_conncount_list list;
+ struct nf_conncount_list *list;
u32 limit;
bool invert;
};
@@ -43,12 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
return;
}
- if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) {
+ if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) {
regs->verdict.code = NF_DROP;
return;
}
- count = priv->list.count;
+ count = priv->list->count;
if ((count > priv->limit) ^ priv->invert) {
regs->verdict.code = NFT_BREAK;
@@ -62,6 +62,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
{
bool invert = false;
u32 flags, limit;
+ int err;
if (!tb[NFTA_CONNLIMIT_COUNT])
return -EINVAL;
@@ -76,18 +77,31 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- nf_conncount_list_init(&priv->list);
+ priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
+ if (!priv->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv->list);
priv->limit = limit;
priv->invert = invert;
- return nf_ct_netns_get(ctx->net, ctx->family);
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err < 0)
+ goto err_netns;
+
+ return 0;
+err_netns:
+ kfree(priv->list);
+
+ return err;
}
static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
struct nft_connlimit *priv)
{
nf_ct_netns_put(ctx->net, ctx->family);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -200,7 +214,11 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- nf_conncount_list_init(&priv_dst->list);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
+ if (!priv_dst->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv_dst->list);
priv_dst->limit = priv_src->limit;
priv_dst->invert = priv_src->invert;
@@ -212,7 +230,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
@@ -221,7 +240,7 @@ static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
bool ret;
local_bh_disable();
- ret = nf_conncount_gc_list(net, &priv->list);
+ ret = nf_conncount_gc_list(net, priv->list);
local_bh_enable();
return ret;
@@ -238,6 +257,7 @@ static const struct nft_expr_ops nft_connlimit_ops = {
.destroy_clone = nft_connlimit_destroy_clone,
.dump = nft_connlimit_dump,
.gc = nft_connlimit_gc,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_connlimit_type __read_mostly = {
@@ -280,3 +300,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso");
MODULE_ALIAS_NFT_EXPR("connlimit");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
+MODULE_DESCRIPTION("nftables connlimit rule support");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index f6d4d0fa23a6..f4d3573e8782 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -13,6 +13,8 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables_offload.h>
struct nft_counter {
s64 bytes;
@@ -60,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- cpu_stats = alloc_percpu(struct nft_counter);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
@@ -173,7 +175,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
-static struct nft_object_type nft_counter_obj_type;
+struct nft_object_type nft_counter_obj_type;
static const struct nft_object_ops nft_counter_obj_ops = {
.type = &nft_counter_obj_type,
.size = sizeof(struct nft_counter_percpu_priv),
@@ -183,7 +185,7 @@ static const struct nft_object_ops nft_counter_obj_ops = {
.dump = nft_counter_obj_dump,
};
-static struct nft_object_type nft_counter_obj_type __read_mostly = {
+struct nft_object_type nft_counter_obj_type __read_mostly = {
.type = NFT_OBJECT_COUNTER,
.ops = &nft_counter_obj_ops,
.maxattr = NFTA_COUNTER_MAX,
@@ -191,9 +193,8 @@ static struct nft_object_type nft_counter_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
@@ -248,7 +249,41 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
return 0;
}
-static struct nft_expr_type nft_counter_type;
+static int nft_counter_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ /* No specific offload action is needed, but report success. */
+ return 0;
+}
+
+static void nft_counter_offload_stats(struct nft_expr *expr,
+ const struct flow_stats *stats)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter *this_cpu;
+ seqcount_t *myseq;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ myseq = this_cpu_ptr(&nft_counter_seq);
+
+ write_seqcount_begin(myseq);
+ this_cpu->packets += stats->pkts;
+ this_cpu->bytes += stats->bytes;
+ write_seqcount_end(myseq);
+ preempt_enable();
+}
+
+void nft_counter_init_seqcount(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
+}
+
+struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)),
@@ -258,9 +293,12 @@ static const struct nft_expr_ops nft_counter_ops = {
.destroy_clone = nft_counter_destroy,
.dump = nft_counter_dump,
.clone = nft_counter_clone,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_counter_offload,
+ .offload_stats = nft_counter_offload_stats,
};
-static struct nft_expr_type nft_counter_type __read_mostly = {
+struct nft_expr_type nft_counter_type __read_mostly = {
.name = "counter",
.ops = &nft_counter_ops,
.policy = nft_counter_policy,
@@ -268,38 +306,3 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
.flags = NFT_EXPR_STATEFUL,
.owner = THIS_MODULE,
};
-
-static int __init nft_counter_module_init(void)
-{
- int cpu, err;
-
- for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
-
- err = nft_register_obj(&nft_counter_obj_type);
- if (err < 0)
- return err;
-
- err = nft_register_expr(&nft_counter_type);
- if (err < 0)
- goto err1;
-
- return 0;
-err1:
- nft_unregister_obj(&nft_counter_obj_type);
- return err;
-}
-
-static void __exit nft_counter_module_exit(void)
-{
- nft_unregister_expr(&nft_counter_type);
- nft_unregister_obj(&nft_counter_obj_type);
-}
-
-module_init(nft_counter_module_init);
-module_exit(nft_counter_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("counter");
-MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index faea72c2df32..a3f01f209a53 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -26,9 +26,10 @@
struct nft_ct {
enum nft_ct_keys key:8;
enum ip_conntrack_dir dir:8;
+ u8 len;
union {
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
+ u8 dreg;
+ u8 sreg;
};
};
@@ -41,6 +42,7 @@ struct nft_ct_helper_obj {
#ifdef CONFIG_NF_CONNTRACK_ZONES
static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
+static DEFINE_MUTEX(nft_ct_pcpu_mutex);
#endif
static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
@@ -129,7 +131,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
}
#endif
- case NFT_CT_BYTES: /* fallthrough */
+ case NFT_CT_BYTES:
case NFT_CT_PKTS: {
const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
u64 count = 0;
@@ -177,8 +179,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
}
#endif
case NFT_CT_ID:
- if (!nf_ct_is_confirmed(ct))
- goto err;
*dest = nf_ct_get_id(ct);
return;
default:
@@ -204,12 +204,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
case NFT_CT_SRC_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->src.u3.ip;
+ *dest = (__force __u32)tuple->src.u3.ip;
return;
case NFT_CT_DST_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->dst.u3.ip;
+ *dest = (__force __u32)tuple->dst.u3.ip;
return;
case NFT_CT_SRC_IP6:
if (nf_ct_l3num(ct) != NFPROTO_IPV6)
@@ -260,10 +260,13 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+ if (likely(refcount_read(&ct->ct_general.use) == 1)) {
+ refcount_inc(&ct->ct_general.use);
nf_ct_zone_add(ct, &zone);
} else {
- /* previous skb got queued to userspace */
+ /* previous skb got queued to userspace, allocate temporary
+ * one until percpu template can be reused.
+ */
ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
if (!ct) {
regs->verdict.code = NF_DROP;
@@ -271,7 +274,6 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
}
}
- atomic_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
}
#endif
@@ -376,7 +378,6 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
return false;
}
- atomic_set(&tmp->ct_general.use, 1);
per_cpu(nft_ct_pcpu_template, cpu) = tmp;
}
@@ -500,9 +501,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
}
}
- priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
if (err < 0)
return err;
@@ -528,8 +529,11 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
+ mutex_lock(&nft_ct_pcpu_mutex);
if (--nft_ct_pcpu_template_refcnt == 0)
nft_ct_tmpl_put_pcpu();
+ mutex_unlock(&nft_ct_pcpu_mutex);
+ break;
#endif
default:
break;
@@ -566,9 +570,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
- if (!nft_ct_tmpl_alloc_pcpu())
+ mutex_lock(&nft_ct_pcpu_mutex);
+ if (!nft_ct_tmpl_alloc_pcpu()) {
+ mutex_unlock(&nft_ct_pcpu_mutex);
return -ENOMEM;
+ }
nft_ct_pcpu_template_refcnt++;
+ mutex_unlock(&nft_ct_pcpu_mutex);
len = sizeof(u16);
break;
#endif
@@ -602,8 +610,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
}
}
- priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
- err = nft_validate_register_load(priv->sreg, len);
+ priv->len = len;
+ err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len);
if (err < 0)
goto err1;
@@ -672,6 +680,29 @@ nla_put_failure:
return -1;
}
+static bool nft_ct_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ const struct nft_ct *ct;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ ct = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != ct->key) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -705,8 +736,27 @@ static const struct nft_expr_ops nft_ct_get_ops = {
.init = nft_ct_get_init,
.destroy = nft_ct_get_destroy,
.dump = nft_ct_get_dump,
+ .reduce = nft_ct_get_reduce,
};
+static bool nft_ct_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_ct_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_ct_set_ops = {
.type = &nft_ct_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
@@ -714,6 +764,7 @@ static const struct nft_expr_ops nft_ct_set_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -724,6 +775,7 @@ static const struct nft_expr_ops nft_ct_set_zone_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#endif
@@ -780,6 +832,7 @@ static const struct nft_expr_ops nft_notrack_ops = {
.type = &nft_notrack_type,
.size = NFT_EXPR_SIZE(0),
.eval = nft_notrack_eval,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_notrack_type __read_mostly = {
@@ -990,7 +1043,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
if (!priv->l4proto)
return -ENOENT;
- nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
+ nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
if (tb[NFTA_CT_HELPER_L3PROTO])
family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO]));
@@ -1013,8 +1066,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
help6 = nf_conntrack_helper_try_module_get(name, family,
priv->l4proto);
break;
- case NFPROTO_NETDEV: /* fallthrough */
- case NFPROTO_BRIDGE: /* same */
+ case NFPROTO_NETDEV:
+ case NFPROTO_BRIDGE:
case NFPROTO_INET:
help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4,
priv->l4proto);
@@ -1220,7 +1273,7 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj,
struct nf_conn *ct;
ct = nf_ct_get(pkt->skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_UNTRACKED) {
+ if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -1345,3 +1398,4 @@ MODULE_ALIAS_NFT_EXPR("notrack");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT);
+MODULE_DESCRIPTION("Netfilter nf_tables conntrack module");
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index c2e78c160fd7..63507402716d 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -14,7 +14,7 @@
#include <net/netfilter/nf_dup_netdev.h>
struct nft_dup_netdev {
- enum nft_registers sreg_dev:8;
+ u8 sreg_dev;
};
static void nft_dup_netdev_eval(const struct nft_expr *expr,
@@ -40,8 +40,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_DEV] == NULL)
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
}
static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx,
return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif);
}
+static bool nft_dup_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
static struct nft_expr_type nft_dup_netdev_type;
static const struct nft_expr_ops nft_dup_netdev_ops = {
.type = &nft_dup_netdev_type,
@@ -74,7 +79,9 @@ static const struct nft_expr_ops nft_dup_netdev_ops = {
.eval = nft_dup_netdev_eval,
.init = nft_dup_netdev_init,
.dump = nft_dup_netdev_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_dup_netdev_offload,
+ .offload_action = nft_dup_netdev_offload_action,
};
static struct nft_expr_type nft_dup_netdev_type __read_mostly = {
@@ -102,3 +109,4 @@ module_exit(nft_dup_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(5, "dup");
+MODULE_DESCRIPTION("nftables netdev packet duplication support");
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 683785225a3e..6983e6ddeef9 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -16,28 +16,31 @@ struct nft_dynset {
struct nft_set *set;
struct nft_set_ext_tmpl tmpl;
enum nft_dynset_ops op:8;
- enum nft_registers sreg_key:8;
- enum nft_registers sreg_data:8;
+ u8 sreg_key;
+ u8 sreg_data;
bool invert;
+ bool expr;
+ u8 num_exprs;
u64 timeout;
- struct nft_expr *expr;
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX];
struct nft_set_binding binding;
};
-static int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+static int nft_dynset_expr_setup(const struct nft_dynset *priv,
+ const struct nft_set_ext *ext)
{
- int err;
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ int i;
- if (src->ops->clone) {
- dst->ops = src->ops;
- err = src->ops->clone(dst, src);
- if (err < 0)
- return err;
- } else {
- memcpy(dst, src, src->ops->size);
+ for (i = 0; i < priv->num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+ if (nft_expr_clone(expr, priv->expr_array[i]) < 0)
+ return -1;
+
+ elem_expr->size += priv->expr_array[i]->ops->size;
}
- __module_get(src->ops->type->owner);
return 0;
}
@@ -57,12 +60,11 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
&regs->data[priv->sreg_key], NULL,
&regs->data[priv->sreg_data],
timeout, 0, GFP_ATOMIC);
- if (elem == NULL)
+ if (IS_ERR(elem))
goto err1;
ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL &&
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
goto err2;
return elem;
@@ -81,7 +83,6 @@ void nft_dynset_eval(const struct nft_expr *expr,
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set *set = priv->set;
const struct nft_set_ext *ext;
- const struct nft_expr *sexpr;
u64 timeout;
if (priv->op == NFT_DYNSET_OP_DELETE) {
@@ -91,18 +92,13 @@ void nft_dynset_eval(const struct nft_expr *expr,
if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
expr, regs, &ext)) {
- sexpr = NULL;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- sexpr = nft_set_ext_expr(ext);
-
if (priv->op == NFT_DYNSET_OP_UPDATE &&
nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
timeout = priv->timeout ? : set->timeout;
*nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
}
- if (sexpr != NULL)
- sexpr->ops->eval(sexpr, regs, pkt);
+ nft_set_elem_update_expr(ext, regs, pkt);
if (priv->invert)
regs->verdict.code = NFT_BREAK;
@@ -113,6 +109,41 @@ void nft_dynset_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static void nft_dynset_ext_add_expr(struct nft_dynset *priv)
+{
+ u8 size = 0;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++)
+ size += priv->expr_array[i]->ops->size;
+
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+}
+
+static struct nft_expr *
+nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, int pos)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_set_elem_expr_alloc(ctx, set, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_dynset_expr;
+ }
+
+ return expr;
+
+err_dynset_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
@@ -123,19 +154,21 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
[NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED },
};
static int nft_dynset_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct nft_dynset *priv = nft_expr_priv(expr);
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set *set;
u64 timeout;
- int err;
+ int err, i;
- lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
tb[NFTA_DYNSET_OP] == NULL ||
@@ -144,11 +177,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
-
- if (flags & ~NFT_DYNSET_F_INV)
- return -EINVAL;
+ if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR))
+ return -EOPNOTSUPP;
if (flags & NFT_DYNSET_F_INV)
priv->invert = true;
+ if (flags & NFT_DYNSET_F_EXPR)
+ priv->expr = true;
}
set = nft_set_lookup_global(ctx->net, ctx->table,
@@ -164,82 +198,123 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return -EBUSY;
priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP]));
- switch (priv->op) {
- case NFT_DYNSET_OP_ADD:
- case NFT_DYNSET_OP_DELETE:
- break;
- case NFT_DYNSET_OP_UPDATE:
- if (!(set->flags & NFT_SET_TIMEOUT))
- return -EOPNOTSUPP;
- break;
- default:
+ if (priv->op > NFT_DYNSET_OP_DELETE)
return -EOPNOTSUPP;
- }
timeout = 0;
if (tb[NFTA_DYNSET_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
- return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- tb[NFTA_DYNSET_TIMEOUT])));
+ return -EOPNOTSUPP;
+
+ err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout);
+ if (err)
+ return err;
}
- priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
- err = nft_validate_register_load(priv->sreg_key, set->klen);
+ err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key,
+ set->klen);
if (err < 0)
return err;
if (tb[NFTA_DYNSET_SREG_DATA] != NULL) {
if (!(set->flags & NFT_SET_MAP))
- return -EINVAL;
+ return -EOPNOTSUPP;
if (set->dtype == NFT_DATA_VERDICT)
return -EOPNOTSUPP;
- priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]);
- err = nft_validate_register_load(priv->sreg_data, set->dlen);
+ err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA],
+ &priv->sreg_data, set->dlen);
if (err < 0)
return err;
} else if (set->flags & NFT_SET_MAP)
return -EINVAL;
- if (tb[NFTA_DYNSET_EXPR] != NULL) {
- if (!(set->flags & NFT_SET_EVAL))
- return -EINVAL;
+ if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) &&
+ !(set->flags & NFT_SET_EVAL))
+ return -EINVAL;
- priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
- if (IS_ERR(priv->expr))
- return PTR_ERR(priv->expr);
+ if (tb[NFTA_DYNSET_EXPR]) {
+ struct nft_expr *dynset_expr;
- err = -EOPNOTSUPP;
- if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
- goto err1;
-
- if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
- if (set->flags & NFT_SET_TIMEOUT)
- goto err1;
- if (!set->ops->gc_init)
- goto err1;
- set->ops->gc_init(set);
+ dynset_expr = nft_dynset_expr_alloc(ctx, set,
+ tb[NFTA_DYNSET_EXPR], 0);
+ if (IS_ERR(dynset_expr))
+ return PTR_ERR(dynset_expr);
+
+ priv->num_exprs++;
+ priv->expr_array[0] = dynset_expr;
+
+ if (set->num_exprs > 1 ||
+ (set->num_exprs == 1 &&
+ dynset_expr->ops != set->exprs[0]->ops)) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
}
+ } else if (tb[NFTA_DYNSET_EXPRESSIONS]) {
+ struct nft_expr *dynset_expr;
+ struct nlattr *tmp;
+ int left;
+
+ if (!priv->expr)
+ return -EINVAL;
+
+ i = 0;
+ nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_expr_free;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i);
+ if (IS_ERR(dynset_expr)) {
+ err = PTR_ERR(dynset_expr);
+ goto err_expr_free;
+ }
+ priv->expr_array[i] = dynset_expr;
+ priv->num_exprs++;
+
+ if (set->num_exprs &&
+ dynset_expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ } else if (set->num_exprs > 0) {
+ err = nft_set_elem_expr_clone(ctx, set, priv->expr_array);
+ if (err < 0)
+ return err;
+
+ priv->num_exprs = set->num_exprs;
}
nft_set_ext_prepare(&priv->tmpl);
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen);
if (set->flags & NFT_SET_MAP)
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen);
- if (priv->expr != NULL)
- nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR,
- priv->expr->ops->size);
+
+ if (priv->num_exprs)
+ nft_dynset_ext_add_expr(priv);
+
if (set->flags & NFT_SET_TIMEOUT) {
- if (timeout || set->timeout)
+ if (timeout || set->timeout) {
+ nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT);
nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION);
+ }
}
priv->timeout = timeout;
err = nf_tables_bind_set(ctx, set, &priv->binding);
if (err < 0)
- goto err1;
+ goto err_expr_free;
if (set->size == 0)
set->size = 0xffff;
@@ -247,9 +322,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->set = set;
return 0;
-err1:
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+err_expr_free:
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
return err;
}
@@ -274,9 +349,10 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_dynset *priv = nft_expr_priv(expr);
+ int i;
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
nf_tables_destroy_set(ctx, priv->set);
}
@@ -285,6 +361,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
+ int i;
if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
goto nla_put_failure;
@@ -296,11 +373,29 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(priv->timeout)),
+ nf_jiffies64_to_msecs(priv->timeout),
NFTA_DYNSET_PAD))
goto nla_put_failure;
- if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
- goto nla_put_failure;
+ if (priv->set->num_exprs == 0) {
+ if (priv->num_exprs == 1) {
+ if (nft_expr_dump(skb, NFTA_DYNSET_EXPR,
+ priv->expr_array[0]))
+ goto nla_put_failure;
+ } else if (priv->num_exprs > 1) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ priv->expr_array[i]))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+ }
if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
goto nla_put_failure;
return 0;
@@ -318,6 +413,7 @@ static const struct nft_expr_ops nft_dynset_ops = {
.activate = nft_dynset_activate,
.deactivate = nft_dynset_deactivate,
.dump = nft_dynset_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_dynset_type __read_mostly = {
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a5e8469859e3..a67ea9c3ae57 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -10,8 +10,10 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/sctp.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/sctp/sctp.h>
#include <net/tcp.h>
struct nft_exthdr {
@@ -19,8 +21,8 @@ struct nft_exthdr {
u8 offset;
u8 len;
u8 op;
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
+ u8 dreg;
+ u8 sreg;
u8 flags;
};
@@ -42,9 +44,12 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
unsigned int offset = 0;
int err;
+ if (pkt->skb->protocol != htons(ETH_P_IPV6))
+ goto err;
+
err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = (err >= 0);
+ nft_reg_store8(dest, err >= 0);
return;
} else if (err < 0) {
goto err;
@@ -141,7 +146,7 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type);
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = (err >= 0);
+ nft_reg_store8(dest, err >= 0);
return;
} else if (err < 0) {
goto err;
@@ -162,10 +167,10 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
{
struct tcphdr *tcph;
- if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff)
return NULL;
- tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer);
+ tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer);
if (!tcph)
return NULL;
@@ -173,7 +178,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len)
return NULL;
- return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer);
+ return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer);
}
static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
@@ -228,16 +233,14 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
unsigned int i, optl, tcphdr_len, offset;
struct tcphdr *tcph;
u8 *opt;
- u32 src;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
if (!tcph)
- return;
+ goto err;
opt = (u8 *)tcph;
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
union {
- u8 octet;
__be16 v16;
__be32 v32;
} old, new;
@@ -248,24 +251,24 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
continue;
if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
- return;
+ goto err;
if (skb_ensure_writable(pkt->skb,
- pkt->xt.thoff + i + priv->len))
- return;
+ nft_thoff(pkt) + i + priv->len))
+ goto err;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
&tcphdr_len);
if (!tcph)
- return;
+ goto err;
- src = regs->data[priv->sreg];
offset = i + priv->offset;
switch (priv->len) {
case 2:
- old.v16 = get_unaligned((u16 *)(opt + offset));
- new.v16 = src;
+ old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset));
+ new.v16 = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg]);
switch (priv->type) {
case TCPOPT_MSS:
@@ -278,18 +281,18 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (old.v16 == new.v16)
return;
- put_unaligned(new.v16, (u16*)(opt + offset));
+ put_unaligned(new.v16, (__be16*)(opt + offset));
inet_proto_csum_replace2(&tcph->check, pkt->skb,
old.v16, new.v16, false);
break;
case 4:
- new.v32 = src;
- old.v32 = get_unaligned((u32 *)(opt + offset));
+ new.v32 = nft_reg_load_be32(&regs->data[priv->sreg]);
+ old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
return;
- put_unaligned(new.v32, (u32*)(opt + offset));
+ put_unaligned(new.v32, (__be32*)(opt + offset));
inet_proto_csum_replace4(&tcph->check, pkt->skb,
old.v32, new.v32, false);
break;
@@ -300,6 +303,108 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
return;
}
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int i, tcphdr_len, optl;
+ struct tcphdr *tcph;
+ u8 *opt;
+
+ tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!tcph)
+ goto err;
+
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto drop;
+
+ opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!opt)
+ goto err;
+ for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+ unsigned int j;
+
+ optl = optlen(opt, i);
+ if (priv->type != opt[i])
+ continue;
+
+ if (i + optl > tcphdr_len)
+ goto drop;
+
+ for (j = 0; j < optl; ++j) {
+ u16 n = TCPOPT_NOP;
+ u16 o = opt[i+j];
+
+ if ((i + j) % 2 == 0) {
+ o <<= 8;
+ n <<= 8;
+ }
+ inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o),
+ htons(n), false);
+ }
+ memset(opt + i, TCPOPT_NOP, optl);
+ return;
+ }
+
+ /* option not found, continue. This allows to do multiple
+ * option removals per rule.
+ */
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+ return;
+drop:
+ /* can't remove, no choice but to drop */
+ regs->verdict.code = NF_DROP;
+}
+
+static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr);
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
+
+ if (pkt->tprot != IPPROTO_SCTP)
+ goto err;
+
+ do {
+ sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch);
+ if (!sch || !sch->length)
+ break;
+
+ if (sch->type == priv->type) {
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ nft_reg_store8(dest, true);
+ return;
+ }
+ if (priv->offset + priv->len > ntohs(sch->length) ||
+ offset + ntohs(sch->length) > pkt->skb->len)
+ break;
+
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (skb_copy_bits(pkt->skb, offset + priv->offset,
+ dest, priv->len) < 0)
+ break;
+ return;
+ }
+ offset += SCTP_PAD4(ntohs(sch->length));
+ } while (offset < pkt->skb->len);
+err:
+ if (priv->flags & NFT_EXTHDR_F_PRESENT)
+ nft_reg_store8(dest, false);
+ else
+ regs->verdict.code = NFT_BREAK;
}
static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
@@ -352,12 +457,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
- priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
priv->flags = flags;
priv->op = op;
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
@@ -402,11 +507,33 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
- priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]);
priv->flags = flags;
priv->op = op;
- return nft_validate_register_load(priv->sreg, priv->len);
+ return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg,
+ priv->len);
+}
+
+/*
+ * Initialise the TCP option strip expression.
+ *
+ * Only NFTA_EXTHDR_TYPE is accepted and it is mandatory; supplying a source
+ * or destination register, flags, offset or length yields -EINVAL.
+ * Returns 0 on success.
+ */
+static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr,
+				     const struct nlattr * const tb[])
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	const struct nlattr *type_attr = tb[NFTA_EXTHDR_TYPE];
+	bool has_unsupported = tb[NFTA_EXTHDR_SREG] ||
+			       tb[NFTA_EXTHDR_DREG] ||
+			       tb[NFTA_EXTHDR_FLAGS] ||
+			       tb[NFTA_EXTHDR_OFFSET] ||
+			       tb[NFTA_EXTHDR_LEN];
+
+	if (has_unsupported || !type_attr)
+		return -EINVAL;
+
+	priv->op = NFT_EXTHDR_OP_TCPOPT;
+	priv->type = nla_get_u8(type_attr);
+
+	return 0;
}
static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
@@ -469,12 +596,47 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
+/* Dump the strip expression by delegating to nft_exthdr_dump_common(). */
+static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	return nft_exthdr_dump_common(skb, nft_expr_priv(expr));
+}
+
+/*
+ * Register-tracking hook for the exthdr load expressions.
+ *
+ * Returns false after recording this expression as the selector of its
+ * destination register when nft_reg_track_cmp() fails or when the tracked
+ * selector differs in type, op, flags, offset or len. Returns true when the
+ * tracked selector matches and no bitwise expression is tracked on the
+ * register; otherwise the decision is left to nft_expr_reduce_bitwise().
+ *
+ * NOTE(review): a true return presumably tells the caller that this
+ * expression is redundant - confirm against the users of .reduce.
+ */
+static bool nft_exthdr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+ const struct nft_exthdr *exthdr;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ /* Compare every field against the selector already tracked on dreg. */
+ exthdr = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->type != exthdr->type ||
+ priv->op != exthdr->op ||
+ priv->flags != exthdr->flags ||
+ priv->offset != exthdr->offset ||
+ priv->len != exthdr->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
.eval = nft_exthdr_ipv6_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
@@ -483,6 +645,7 @@ static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
.eval = nft_exthdr_ipv4_eval,
.init = nft_exthdr_ipv4_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_ops = {
@@ -491,6 +654,7 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = {
.eval = nft_exthdr_tcp_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
@@ -499,6 +663,25 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
.eval = nft_exthdr_tcp_set_eval,
.init = nft_exthdr_tcp_set_init,
.dump = nft_exthdr_dump_set,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+/* Ops table wiring the TCP option strip eval, init and dump handlers. */
+static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_tcp_strip_eval,
+ .init = nft_exthdr_tcp_strip_init,
+ .dump = nft_exthdr_dump_strip,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+/*
+ * Ops table for SCTP chunk matching; it reuses the generic exthdr init,
+ * dump and reduce handlers and only supplies its own eval.
+ */
+static const struct nft_expr_ops nft_exthdr_sctp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_sctp_eval,
+ .init = nft_exthdr_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops *
@@ -520,7 +703,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_tcp_set_ops;
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_tcp_ops;
- break;
+ return &nft_exthdr_tcp_strip_ops;
case NFT_EXTHDR_OP_IPV6:
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
@@ -531,6 +714,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_ipv4_ops;
}
break;
+ case NFT_EXTHDR_OP_SCTP:
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_sctp_ops;
+ break;
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index cfac0964f48d..1f12d7ade606 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -32,9 +32,13 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
unsigned int hooks;
switch (priv->result) {
- case NFT_FIB_RESULT_OIF: /* fallthrough */
+ case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
hooks = (1 << NF_INET_PRE_ROUTING);
+ if (priv->flags & NFTA_FIB_F_IIF) {
+ hooks |= (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ }
break;
case NFT_FIB_RESULT_ADDRTYPE:
if (priv->flags & NFTA_FIB_F_IIF)
@@ -86,7 +90,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
- priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
@@ -106,8 +109,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
}
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ err = nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
if (err < 0)
return err;
@@ -157,5 +160,47 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
}
EXPORT_SYMBOL_GPL(nft_fib_store_result);
+/*
+ * Register-tracking hook shared by the fib expressions.
+ *
+ * Returns true only when the destination register is already tracked with a
+ * fib selector that has the same result and flags, and no bitwise
+ * expression is tracked on it. In the mismatch cases the register is
+ * re-recorded with the length this expression stores (IFNAMSIZ for an
+ * interface-name result without NFTA_FIB_F_PRESENT, NFT_REG32_SIZE
+ * otherwise) and false is returned.
+ */
+bool nft_fib_reduce(struct nft_regs_track *track,
+		    const struct nft_expr *expr)
+{
+	const struct nft_fib *priv = nft_expr_priv(expr);
+	const struct nft_fib *tracked;
+	unsigned int len;
+
+	switch (priv->result) {
+	case NFT_FIB_RESULT_OIFNAME:
+		len = (priv->flags & NFTA_FIB_F_PRESENT) ? NFT_REG32_SIZE :
+							   IFNAMSIZ;
+		break;
+	case NFT_FIB_RESULT_OIF:
+	case NFT_FIB_RESULT_ADDRTYPE:
+		len = NFT_REG32_SIZE;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		len = NFT_REG32_SIZE;
+		break;
+	}
+
+	if (nft_reg_track_cmp(track, expr, priv->dreg)) {
+		tracked = nft_expr_priv(track->regs[priv->dreg].selector);
+		if (priv->result == tracked->result &&
+		    priv->flags == tracked->flags)
+			return !track->regs[priv->dreg].bitwise;
+	}
+
+	nft_reg_track_update(track, expr, priv->dreg, len);
+	return false;
+}
+EXPORT_SYMBOL_GPL(nft_fib_reduce);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
index 465432e0531b..666a3741d20b 100644
--- a/net/netfilter/nft_fib_inet.c
+++ b/net/netfilter/nft_fib_inet.c
@@ -49,6 +49,7 @@ static const struct nft_expr_ops nft_fib_inet_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_inet_type __read_mostly = {
@@ -76,3 +77,4 @@ module_exit(nft_fib_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_ALIAS_NFT_AF_EXPR(1, "fib");
+MODULE_DESCRIPTION("nftables fib inet support");
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index a2e726ae7f07..9121ec64e918 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -58,6 +58,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_netdev_type __read_mostly = {
@@ -85,3 +86,4 @@ module_exit(nft_fib_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo M. Bermudo Garay <pablombg@gmail.com>");
MODULE_ALIAS_NFT_AF_EXPR(5, "fib");
+MODULE_DESCRIPTION("nftables netdev fib lookups support");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b70b48996801..a25c88bc8b75 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -19,10 +19,210 @@ struct nft_flow_offload {
struct nft_flowtable *flowtable;
};
+/*
+ * Pick the transmit type for a route: XFRM when dst_xfrm() reports a
+ * transform on the dst entry, NEIGH otherwise.
+ */
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+	return dst_xfrm(dst) ? FLOW_OFFLOAD_XMIT_XFRM : FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+/*
+ * Fill in the default forwarding information for direction @dir from the
+ * cached route @dst_cache.
+ *
+ * The device of the dst entry becomes the input interface of the opposite
+ * direction; the dst entry and its transmit type are stored for @dir.
+ */
+static void nft_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir)
+{
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
+}
+
+/*
+ * Return true when @dev is a non-NULL, non-loopback Ethernet device whose
+ * hardware address is ETH_ALEN bytes long and passes is_valid_ether_addr().
+ */
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+	return dev &&
+	       !(dev->flags & IFF_LOOPBACK) &&
+	       dev->type == ARPHRD_ETHER &&
+	       dev->addr_len == ETH_ALEN &&
+	       is_valid_ether_addr(dev->dev_addr);
+}
+
+/*
+ * Resolve the neighbour hardware address for the peer of direction @dir and
+ * ask dev_fill_forward_path() to build the device path stack.
+ *
+ * Returns -1 when no neighbour entry is found or its state is not
+ * NUD_VALID; otherwise returns whatever dev_fill_forward_path() returns.
+ *
+ * When the route device is not a valid Ethernet device the neighbour lookup
+ * is skipped and @ha is handed to dev_fill_forward_path() without being
+ * written here.
+ */
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir, u8 *ha,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ struct neighbour *n;
+ u8 nud_state;
+
+ if (!nft_is_valid_ether_device(dev))
+ goto out;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ /* Snapshot state and address under the neighbour lock, then drop the ref. */
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+out:
+ return dev_fill_forward_path(dev, ha, stack);
+}
+
+/*
+ * Forwarding details collected by nft_dev_path_info() while walking a
+ * device path stack, later copied into the flow route by
+ * nft_dev_forward_path().
+ */
+struct nft_forward_info {
+ /* Device of the most recent supported path entry; NULL marks an unusable path. */
+ const struct net_device *indev;
+ /* First VLAN/PPPoE device seen, falling back to indev. */
+ const struct net_device *outdev;
+ /* Set to indev once the walk is complete. */
+ const struct net_device *hw_outdev;
+ /* Encapsulation id/protocol pairs; num_encaps entries are in use. */
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
+ /* Bitmask indexed by encapsulation slot, set for DEV_PATH_BR_VLAN_UNTAG_HW. */
+ u8 ingress_vlans;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ enum flow_offload_xmit_type xmit_type;
+};
+
+/*
+ * Translate a device path stack into the forwarding information used to
+ * program a flow route.
+ *
+ * @stack:     path entries produced by dev_fill_forward_path()
+ * @info:      output; expected to be zero-initialised by the caller
+ * @ha:        destination hardware address, copied to info->h_dest
+ * @flowtable: flowtable consulted for hardware offload
+ *
+ * info->indev is set to NULL when the path cannot be represented: an
+ * unknown path type, or more encapsulations than NF_FLOW_TABLE_ENCAP_MAX.
+ * The same limit is enforced for VLAN/PPPoE entries and for bridge VLAN
+ * tagging, so info->encap[] is never written out of bounds by a tag push.
+ */
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+			      struct nft_forward_info *info,
+			      unsigned char *ha, struct nf_flowtable *flowtable)
+{
+	const struct net_device_path *path;
+	int i;
+
+	memcpy(info->h_dest, ha, ETH_ALEN);
+
+	for (i = 0; i < stack->num_paths; i++) {
+		path = &stack->path[i];
+		switch (path->type) {
+		case DEV_PATH_ETHERNET:
+		case DEV_PATH_DSA:
+		case DEV_PATH_VLAN:
+		case DEV_PATH_PPPOE:
+			info->indev = path->dev;
+			if (is_zero_ether_addr(info->h_source))
+				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+			if (path->type == DEV_PATH_ETHERNET)
+				break;
+			if (path->type == DEV_PATH_DSA) {
+				/* A DSA entry ends the walk. */
+				i = stack->num_paths;
+				break;
+			}
+
+			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
+			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+				info->indev = NULL;
+				break;
+			}
+			if (!info->outdev)
+				info->outdev = path->dev;
+			info->encap[info->num_encaps].id = path->encap.id;
+			info->encap[info->num_encaps].proto = path->encap.proto;
+			info->num_encaps++;
+			if (path->type == DEV_PATH_PPPOE)
+				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+			break;
+		case DEV_PATH_BRIDGE:
+			if (is_zero_ether_addr(info->h_source))
+				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+			switch (path->bridge.vlan_mode) {
+			case DEV_PATH_BR_VLAN_UNTAG_HW:
+				info->ingress_vlans |= BIT(info->num_encaps - 1);
+				break;
+			case DEV_PATH_BR_VLAN_TAG:
+				/* Same limit as the VLAN/PPPoE case above. */
+				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+					info->indev = NULL;
+					break;
+				}
+				info->encap[info->num_encaps].id = path->bridge.vlan_id;
+				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+				info->num_encaps++;
+				break;
+			case DEV_PATH_BR_VLAN_UNTAG:
+				info->num_encaps--;
+				break;
+			case DEV_PATH_BR_VLAN_KEEP:
+				break;
+			}
+			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+			break;
+		default:
+			info->indev = NULL;
+			break;
+		}
+	}
+	if (!info->outdev)
+		info->outdev = info->indev;
+
+	info->hw_outdev = info->indev;
+
+	if (nf_flowtable_hw_offload(flowtable) &&
+	    nft_is_valid_ether_device(info->indev))
+		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+
+/* Return true when @dev is the device of one of the hooks on @ft's hook list. */
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+				   struct nft_flowtable *ft)
+{
+	struct nft_hook *hook;
+
+	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+		if (hook->ops.dev == dev)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Refine the route for direction @dir using the real device path.
+ *
+ * The route is left untouched when no path information could be gathered or
+ * when the resulting input device is not attached to flowtable @ft.
+ * Otherwise the input interface, encapsulations and ingress VLAN mask of
+ * the opposite direction are filled in, and direct transmit details are
+ * stored for @dir when the path asked for FLOW_OFFLOAD_XMIT_DIRECT.
+ */
+static void nft_dev_forward_path(struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nft_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
+ int i;
+
+ /* On failure info stays zeroed, so the indev check below bails out. */
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nft_dev_path_info(&stack, &info, ha, &ft->data);
+
+ if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+ route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
+}
+
static int nft_flow_route(const struct nft_pktinfo *pkt,
const struct nf_conn *ct,
struct nf_flow_route *route,
- enum ip_conntrack_dir dir)
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
{
struct dst_entry *this_dst = skb_dst(pkt->skb);
struct dst_entry *other_dst = NULL;
@@ -32,11 +232,21 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+ fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos);
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
break;
case NFPROTO_IPV6:
fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
break;
}
@@ -44,8 +254,14 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
if (!other_dst)
return -ENOENT;
- route->tuple[dir].dst = this_dst;
- route->tuple[!dir].dst = other_dst;
+ nft_default_forward_path(route, this_dst, dir);
+ nft_default_forward_path(route, other_dst, !dir);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+ nft_dev_forward_path(route, ct, dir, ft);
+ nft_dev_forward_path(route, ct, !dir, ft);
+ }
return 0;
}
@@ -74,8 +290,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
struct nft_flow_offload *priv = nft_expr_priv(expr);
struct nf_flowtable *flowtable = &priv->flowtable->data;
struct tcphdr _tcph, *tcph = NULL;
+ struct nf_flow_route route = {};
enum ip_conntrack_info ctinfo;
- struct nf_flow_route route;
struct flow_offload *flow;
enum ip_conntrack_dir dir;
struct nf_conn *ct;
@@ -90,19 +306,33 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
case IPPROTO_TCP:
- tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff,
+ tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
sizeof(_tcph), &_tcph);
- if (unlikely(!tcph || tcph->fin || tcph->rst))
+ if (unlikely(!tcph || tcph->fin || tcph->rst ||
+ !nf_conntrack_tcp_established(ct)))
goto out;
break;
case IPPROTO_UDP:
break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: {
+ struct nf_conntrack_tuple *tuple;
+
+ if (ct->status & IPS_NAT_MASK)
+ goto out;
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ /* No support for GRE v1 */
+ if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
+ goto out;
+ break;
+ }
+#endif
default:
goto out;
}
if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
- ct->status & IPS_SEQ_ADJUST)
+ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
goto out;
if (!nf_ct_is_confirmed(ct))
@@ -112,7 +342,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
goto out;
dir = CTINFO2DIR(ctinfo);
- if (nft_flow_route(pkt, ct, &route, dir) < 0)
+ if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
goto err_flow_route;
flow = flow_offload_alloc(ct);
@@ -227,6 +457,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = {
.destroy = nft_flow_offload_destroy,
.validate = nft_flow_offload_validate,
.dump = nft_flow_offload_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_flow_offload_type __read_mostly = {
@@ -286,3 +517,4 @@ module_exit(nft_flow_offload_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
+MODULE_DESCRIPTION("nftables hardware flow offload module");
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index aba11c2333f3..7c5876dc9ff2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -18,7 +18,7 @@
#include <net/ip.h>
struct nft_fwd_netdev {
- enum nft_registers sreg_dev:8;
+ u8 sreg_dev;
};
static void nft_fwd_netdev_eval(const struct nft_expr *expr,
@@ -27,6 +27,11 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
+ struct sk_buff *skb = pkt->skb;
+
+ /* This is used by ifb only. */
+ skb->skb_iif = skb->dev->ifindex;
+ skb_set_redirected(skb, nft_hook(pkt) == NF_NETDEV_INGRESS);
nf_fwd_netdev_egress(pkt, oif);
regs->verdict.code = NF_STOLEN;
@@ -47,8 +52,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_FWD_SREG_DEV] == NULL)
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
}
static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -74,9 +79,14 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx,
return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif);
}
+/* .offload_action callback: unconditionally reports true for this expression. */
+static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
struct nft_fwd_neigh {
- enum nft_registers sreg_dev:8;
- enum nft_registers sreg_addr:8;
+ u8 sreg_dev;
+ u8 sreg_addr;
u8 nfproto;
};
@@ -135,6 +145,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
return;
skb->dev = dev;
+ skb_clear_tstamp(skb);
neigh_xmit(neigh_table, dev, addr, skb);
out:
regs->verdict.code = verdict;
@@ -153,8 +164,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
!tb[NFTA_FWD_NFPROTO])
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
- priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
switch (priv->nfproto) {
@@ -168,11 +177,13 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
if (err < 0)
return err;
- return nft_validate_register_load(priv->sreg_addr, addr_len);
+ return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
+ addr_len);
}
static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -190,6 +201,14 @@ nla_put_failure:
return -1;
}
+/*
+ * Restrict the fwd expressions to chains hooked at netdev ingress or
+ * egress; returns the result of nft_chain_validate_hooks().
+ */
+static int nft_fwd_validate(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nft_data **data)
+{
+	const unsigned int hook_mask = (1 << NF_NETDEV_INGRESS) |
+				       (1 << NF_NETDEV_EGRESS);
+
+	return nft_chain_validate_hooks(ctx->chain, hook_mask);
+}
+
static struct nft_expr_type nft_fwd_netdev_type;
static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.type = &nft_fwd_netdev_type,
@@ -197,6 +216,8 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.eval = nft_fwd_neigh_eval,
.init = nft_fwd_neigh_init,
.dump = nft_fwd_neigh_dump,
+ .validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_fwd_netdev_ops = {
@@ -205,7 +226,10 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.eval = nft_fwd_netdev_eval,
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
+ .validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_fwd_netdev_offload,
+ .offload_action = nft_fwd_netdev_offload_action,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index b836d550b919..e5631e88b285 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -14,8 +14,8 @@
#include <linux/jhash.h>
struct nft_jhash {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
u8 len;
bool autogen_seed:1;
u32 modulus;
@@ -38,7 +38,7 @@ static void nft_jhash_eval(const struct nft_expr *expr,
}
struct nft_symhash {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
u32 offset;
};
@@ -83,9 +83,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
if (tb[NFTA_HASH_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
- priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
-
err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len);
if (err < 0)
return err;
@@ -94,6 +91,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
priv->len = len;
+ err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len);
+ if (err < 0)
+ return err;
+
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
if (priv->modulus < 1)
return -ERANGE;
@@ -108,9 +109,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
get_random_bytes(&priv->seed, sizeof(priv->seed));
}
- return nft_validate_register_load(priv->sreg, len) &&
- nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
}
static int nft_symhash_init(const struct nft_ctx *ctx,
@@ -126,8 +126,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
if (tb[NFTA_HASH_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
-
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
if (priv->modulus < 1)
return -ERANGE;
@@ -135,8 +133,9 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ sizeof(u32));
}
static int nft_jhash_dump(struct sk_buff *skb,
@@ -166,6 +165,16 @@ nla_put_failure:
return -1;
}
+/*
+ * Register-tracking hook for jhash: cancels tracking of the u32 destination
+ * register and always returns false.
+ */
+static bool nft_jhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_jhash *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, sizeof(u32));
+
+ return false;
+}
+
static int nft_symhash_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -186,6 +195,30 @@ nla_put_failure:
return -1;
}
+/*
+ * Register-tracking hook for symhash.
+ *
+ * Returns true only when the destination register is already tracked with a
+ * symhash selector using the same offset and modulus and no bitwise
+ * expression is tracked on it. In the mismatch cases the register is
+ * re-recorded for this expression and false is returned.
+ */
+static bool nft_symhash_reduce(struct nft_regs_track *track,
+			       const struct nft_expr *expr)
+{
+	struct nft_symhash *priv = nft_expr_priv(expr);
+	struct nft_symhash *tracked;
+
+	if (nft_reg_track_cmp(track, expr, priv->dreg)) {
+		tracked = nft_expr_priv(track->regs[priv->dreg].selector);
+		if (priv->offset == tracked->offset &&
+		    priv->modulus == tracked->modulus)
+			return !track->regs[priv->dreg].bitwise;
+	}
+
+	nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+	return false;
+}
+
static struct nft_expr_type nft_hash_type;
static const struct nft_expr_ops nft_jhash_ops = {
.type = &nft_hash_type,
@@ -193,6 +226,7 @@ static const struct nft_expr_ops nft_jhash_ops = {
.eval = nft_jhash_eval,
.init = nft_jhash_init,
.dump = nft_jhash_dump,
+ .reduce = nft_jhash_reduce,
};
static const struct nft_expr_ops nft_symhash_ops = {
@@ -201,6 +235,7 @@ static const struct nft_expr_ops nft_symhash_ops = {
.eval = nft_symhash_eval,
.init = nft_symhash_init,
.dump = nft_symhash_dump,
+ .reduce = nft_symhash_reduce,
};
static const struct nft_expr_ops *
@@ -248,3 +283,4 @@ module_exit(nft_hash_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
MODULE_ALIAS_NFT_EXPR("hash");
+MODULE_DESCRIPTION("Netfilter nftables hash module");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index c7f0ef73d939..5f28b21abc7d 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -29,31 +29,64 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
[NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED },
};
+/*
+ * Map the destination register attribute to the data type expected for it:
+ * NFT_DATA_VERDICT for the verdict register, NFT_DATA_VALUE for any other.
+ * The attribute value is truncated to u8 before the comparison.
+ */
+static enum nft_data_types nft_reg_to_type(const struct nlattr *nla)
+{
+	u8 reg = ntohl(nla_get_be32(nla));
+
+	return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE;
+}
+
static int nft_immediate_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_immediate_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .size = sizeof(priv->data),
+ };
int err;
if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
tb[NFTA_IMMEDIATE_DATA] == NULL)
return -EINVAL;
- err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_IMMEDIATE_DATA]);
+ desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]);
+ err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
if (err < 0)
return err;
priv->dlen = desc.len;
- priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, &priv->data,
- desc.type, desc.len);
+ err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG],
+ &priv->dreg, &priv->data, desc.type,
+ desc.len);
if (err < 0)
goto err1;
+ if (priv->dreg == NFT_REG_VERDICT) {
+ struct nft_chain *chain = priv->data.verdict.chain;
+
+ switch (priv->data.verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ if (nft_chain_is_bound(chain)) {
+ err = -EBUSY;
+ goto err1;
+ }
+ chain->bound = true;
+ break;
+ default:
+ break;
+ }
+ }
+
return 0;
err1:
@@ -81,6 +114,39 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx,
return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
}
+/*
+ * Destroy callback for the immediate expression.
+ *
+ * Nothing is done unless the expression stores a jump/goto verdict whose
+ * target chain is bound. In that case every rule of the chain is released
+ * and the chain itself is destroyed, using a copy of @ctx that points at
+ * that chain.
+ */
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_rule *rule, *n;
+ struct nft_ctx chain_ctx;
+ struct nft_chain *chain;
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ return;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+
+ if (!nft_chain_is_bound(chain))
+ break;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ /* The _safe iterator is used because rules are released while walking. */
+ list_for_each_entry_safe(rule, n, &chain->rules, list)
+ nf_tables_rule_release(&chain_ctx, rule);
+
+ nf_tables_chain_destroy(&chain_ctx);
+ break;
+ default:
+ break;
+ }
+}
+
static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
@@ -163,6 +229,27 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx,
return 0;
}
+/*
+ * .offload_action callback: the expression counts as an action only when
+ * it writes to the verdict register.
+ */
+static bool nft_immediate_offload_action(const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *imm = nft_expr_priv(expr);
+
+	return imm->dreg == NFT_REG_VERDICT;
+}
+
+/*
+ * Register-tracking hook for immediate: cancels tracking of the dlen bytes
+ * written to a data register (the verdict register is left alone) and
+ * always returns false.
+ */
+static bool nft_immediate_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ nft_reg_track_cancel(track, priv->dreg, priv->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -170,10 +257,12 @@ static const struct nft_expr_ops nft_imm_ops = {
.init = nft_immediate_init,
.activate = nft_immediate_activate,
.deactivate = nft_immediate_deactivate,
+ .destroy = nft_immediate_destroy,
.dump = nft_immediate_dump,
.validate = nft_immediate_validate,
+ .reduce = nft_immediate_reduce,
.offload = nft_immediate_offload,
- .offload_flags = NFT_OFFLOAD_F_ACTION,
+ .offload_action = nft_immediate_offload_action,
};
struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
new file mode 100644
index 000000000000..bb15a55dad5c
--- /dev/null
+++ b/net/netfilter/nft_last.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+/* Last-use state, allocated separately from the expression private area. */
+struct nft_last {
+ /* jiffies value recorded by the most recent evaluation. */
+ unsigned long jiffies;
+ /* Non-zero once the expression has been evaluated (or restored as set). */
+ unsigned int set;
+};
+
+/* Expression private data: a pointer to the kzalloc()ed state. */
+struct nft_last_priv {
+ struct nft_last *last;
+};
+
+/* Netlink attribute policy: a u32 "set" flag and a u64 millisecond value. */
+static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = {
+ [NFTA_LAST_SET] = { .type = NLA_U32 },
+ [NFTA_LAST_MSECS] = { .type = NLA_U64 },
+};
+
+/*
+ * Allocate the last-use state and optionally restore it from netlink.
+ *
+ * NFTA_LAST_SET, when present, seeds the "set" flag. If the flag is
+ * non-zero and NFTA_LAST_MSECS is present, the recorded jiffies value is
+ * back-dated by that amount of time.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * nf_msecs_to_jiffies64() (the state is freed in that case).
+ */
+static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			 const struct nlattr * const tb[])
+{
+	struct nft_last_priv *priv = nft_expr_priv(expr);
+	struct nft_last *state;
+	u64 elapsed;
+	int ret;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT);
+	if (!state)
+		return -ENOMEM;
+
+	if (tb[NFTA_LAST_SET])
+		state->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
+
+	if (state->set && tb[NFTA_LAST_MSECS]) {
+		ret = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &elapsed);
+		if (ret < 0) {
+			kfree(state);
+			return ret;
+		}
+
+		state->jiffies = jiffies - (unsigned long)elapsed;
+	}
+
+	priv->last = state;
+	return 0;
+}
+
+/*
+ * Record the current jiffies value and mark the state as set.
+ *
+ * Each field is written only when its value actually changes, so repeated
+ * evaluations within the same jiffy perform no stores.
+ */
+static void nft_last_eval(const struct nft_expr *expr,
+ struct nft_regs *regs, const struct nft_pktinfo *pkt)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
+
+ if (READ_ONCE(last->jiffies) != jiffies)
+ WRITE_ONCE(last->jiffies, jiffies);
+ if (READ_ONCE(last->set) == 0)
+ WRITE_ONCE(last->set, 1);
+}
+
+/*
+ * Dump the "set" flag and the time elapsed since the last evaluation.
+ *
+ * When the recorded jiffies value lies after the current one, the state is
+ * reset to "not set" and 0 is reported as elapsed time.
+ * Returns 0 on success, -1 when the attributes do not fit in @skb.
+ */
+static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
+ unsigned long last_jiffies = READ_ONCE(last->jiffies);
+ u32 last_set = READ_ONCE(last->set);
+ __be64 msecs;
+
+ /* NOTE(review): presumably guards against jiffies wrap-around - confirm. */
+ if (time_before(jiffies, last_jiffies)) {
+ WRITE_ONCE(last->set, 0);
+ last_set = 0;
+ }
+
+ if (last_set)
+ msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies);
+ else
+ msecs = 0;
+
+ if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) ||
+ nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+/* Free the separately allocated last-use state. */
+static void nft_last_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+
+ kfree(priv->last);
+}
+
+/*
+ * Clone the expression state from @src into @dst.
+ *
+ * A fresh state object is allocated for @dst and the "set" flag and the
+ * recorded jiffies value are copied over, so the clone reports the same
+ * last-use information as its source instead of starting out as never
+ * used.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	struct nft_last_priv *priv_dst = nft_expr_priv(dst);
+	struct nft_last_priv *priv_src = nft_expr_priv(src);
+
+	priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
+	if (!priv_dst->last)
+		return -ENOMEM;
+
+	/* The source may be evaluated concurrently; read each field once. */
+	priv_dst->last->set = READ_ONCE(priv_src->last->set);
+	priv_dst->last->jiffies = READ_ONCE(priv_src->last->jiffies);
+
+	return 0;
+}
+
+/* Ops table for the "last" expression. */
+static const struct nft_expr_ops nft_last_ops = {
+ .type = &nft_last_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)),
+ .eval = nft_last_eval,
+ .init = nft_last_init,
+ .destroy = nft_last_destroy,
+ .clone = nft_last_clone,
+ .dump = nft_last_dump,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+/* Expression type descriptor for "last", flagged as stateful. */
+struct nft_expr_type nft_last_type __read_mostly = {
+ .name = "last",
+ .ops = &nft_last_ops,
+ .policy = nft_last_policy,
+ .maxattr = NFTA_LAST_MAX,
+ .flags = NFT_EXPR_STATEFUL,
+ .owner = THIS_MODULE,
+};
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 35b67d7e3694..981addb2d051 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -18,6 +18,10 @@ struct nft_limit {
spinlock_t lock;
u64 last;
u64 tokens;
+};
+
+struct nft_limit_priv {
+ struct nft_limit *limit;
u64 tokens_max;
u64 rate;
u64 nsecs;
@@ -25,33 +29,33 @@ struct nft_limit {
bool invert;
};
-static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
+static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
{
u64 now, tokens;
s64 delta;
- spin_lock_bh(&limit->lock);
+ spin_lock_bh(&priv->limit->lock);
now = ktime_get_ns();
- tokens = limit->tokens + now - limit->last;
- if (tokens > limit->tokens_max)
- tokens = limit->tokens_max;
+ tokens = priv->limit->tokens + now - priv->limit->last;
+ if (tokens > priv->tokens_max)
+ tokens = priv->tokens_max;
- limit->last = now;
+ priv->limit->last = now;
delta = tokens - cost;
if (delta >= 0) {
- limit->tokens = delta;
- spin_unlock_bh(&limit->lock);
- return limit->invert;
+ priv->limit->tokens = delta;
+ spin_unlock_bh(&priv->limit->lock);
+ return priv->invert;
}
- limit->tokens = tokens;
- spin_unlock_bh(&limit->lock);
- return !limit->invert;
+ priv->limit->tokens = tokens;
+ spin_unlock_bh(&priv->limit->lock);
+ return !priv->invert;
}
/* Use same default as in iptables. */
#define NFT_LIMIT_PKT_BURST_DEFAULT 5
-static int nft_limit_init(struct nft_limit *limit,
+static int nft_limit_init(struct nft_limit_priv *priv,
const struct nlattr * const tb[], bool pkts)
{
u64 unit, tokens;
@@ -60,58 +64,62 @@ static int nft_limit_init(struct nft_limit *limit,
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
- limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- limit->nsecs = unit * NSEC_PER_SEC;
- if (limit->rate == 0 || limit->nsecs < unit)
+ priv->nsecs = unit * NSEC_PER_SEC;
+ if (priv->rate == 0 || priv->nsecs < unit)
return -EOVERFLOW;
if (tb[NFTA_LIMIT_BURST])
- limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
+ priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
- if (pkts && limit->burst == 0)
- limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+ if (pkts && priv->burst == 0)
+ priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
- if (limit->rate + limit->burst < limit->rate)
+ if (priv->rate + priv->burst < priv->rate)
return -EOVERFLOW;
if (pkts) {
- tokens = div_u64(limit->nsecs, limit->rate) * limit->burst;
+ tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst;
} else {
/* The token bucket size limits the number of tokens can be
* accumulated. tokens_max specifies the bucket size.
* tokens_max = unit * (rate + burst) / rate.
*/
- tokens = div_u64(limit->nsecs * (limit->rate + limit->burst),
- limit->rate);
+ tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst),
+ priv->rate);
}
- limit->tokens = tokens;
- limit->tokens_max = limit->tokens;
+ priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
+ if (!priv->limit)
+ return -ENOMEM;
+
+ priv->limit->tokens = tokens;
+ priv->tokens_max = priv->limit->tokens;
if (tb[NFTA_LIMIT_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
if (flags & NFT_LIMIT_F_INV)
- limit->invert = true;
+ priv->invert = true;
}
- limit->last = ktime_get_ns();
- spin_lock_init(&limit->lock);
+ priv->limit->last = ktime_get_ns();
+ spin_lock_init(&priv->limit->lock);
return 0;
}
-static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv,
enum nft_limit_type type)
{
- u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0;
- u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
+ u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0;
+ u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC);
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate),
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate),
NFTA_LIMIT_PAD) ||
nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs),
NFTA_LIMIT_PAD) ||
- nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
+ nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) ||
nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
goto nla_put_failure;
@@ -121,8 +129,34 @@ nla_put_failure:
return -1;
}
-struct nft_limit_pkts {
- struct nft_limit limit;
+/* Common destroy helper: free the separately allocated limit state that
+ * @priv points to. @ctx is not used here.
+ */
+static void nft_limit_destroy(const struct nft_ctx *ctx,
+ const struct nft_limit_priv *priv)
+{
+ kfree(priv->limit);
+}
+
+/* Common clone helper for limit expressions.
+ *
+ * Copies the configuration (rate, unit, burst, bucket size, invert flag)
+ * from @priv_src and allocates a new state object for @priv_dst whose token
+ * count starts at the source's tokens_max and whose timestamp is "now".
+ *
+ * Returns 0 on success, -ENOMEM if the state object cannot be allocated.
+ */
+static int nft_limit_clone(struct nft_limit_priv *priv_dst,
+			   const struct nft_limit_priv *priv_src)
+{
+	struct nft_limit *state;
+
+	/* Configuration is duplicated by value. */
+	priv_dst->invert = priv_src->invert;
+	priv_dst->burst = priv_src->burst;
+	priv_dst->nsecs = priv_src->nsecs;
+	priv_dst->rate = priv_src->rate;
+	priv_dst->tokens_max = priv_src->tokens_max;
+
+	state = kmalloc(sizeof(*state), GFP_ATOMIC);
+	priv_dst->limit = state;
+	if (!state)
+		return -ENOMEM;
+
+	/* The clone does not inherit the source's current token count. */
+	state->last = ktime_get_ns();
+	state->tokens = priv_src->tokens_max;
+	spin_lock_init(&state->lock);
+
+	return 0;
+}
+
+struct nft_limit_priv_pkts {
+ struct nft_limit_priv limit;
u64 cost;
};
@@ -130,7 +164,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -148,7 +182,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -161,25 +195,46 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
- const struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+/* Expression destroy hook for packet-based limits: hands the embedded
+ * nft_limit_priv to the common destroy helper.
+ */
+static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	const struct nft_limit_priv_pkts *pkts = nft_expr_priv(expr);
+
+	nft_limit_destroy(ctx, &pkts->limit);
+}
+
+/* Expression clone hook for packet-based limits: copies the per-packet cost
+ * and then clones the embedded limit via the common helper, whose return
+ * value is passed through.
+ */
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	const struct nft_limit_priv_pkts *from = nft_expr_priv(src);
+	struct nft_limit_priv_pkts *to = nft_expr_priv(dst);
+
+	to->cost = from->cost;
+	return nft_limit_clone(&to->limit, &from->limit);
+}
+
static struct nft_expr_type nft_limit_type;
static const struct nft_expr_ops nft_limit_pkts_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.eval = nft_limit_pkts_eval,
.init = nft_limit_pkts_init,
+ .destroy = nft_limit_pkts_destroy,
+ .clone = nft_limit_pkts_clone,
.dump = nft_limit_pkts_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static void nft_limit_bytes_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -190,7 +245,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_init(priv, tb, false);
}
@@ -198,17 +253,36 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
static int nft_limit_bytes_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
- const struct nft_limit *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+/* Expression destroy hook for byte-based limits: the expression's private
+ * area is the nft_limit_priv itself, so pass it straight to the common
+ * destroy helper.
+ */
+static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
+				    const struct nft_expr *expr)
+{
+	nft_limit_destroy(ctx, nft_expr_priv(expr));
+}
+
+/* Expression clone hook for byte-based limits: both private areas are
+ * nft_limit_priv objects, so defer entirely to the common clone helper and
+ * return its result.
+ */
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	return nft_limit_clone(nft_expr_priv(dst), nft_expr_priv(src));
+}
+
static const struct nft_expr_ops nft_limit_bytes_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)),
.eval = nft_limit_bytes_eval,
.init = nft_limit_bytes_init,
.dump = nft_limit_bytes_dump,
+ .clone = nft_limit_bytes_clone,
+ .destroy = nft_limit_bytes_destroy,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -240,7 +314,7 @@ static void nft_limit_obj_pkts_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -250,7 +324,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -265,16 +339,25 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit_pkts *priv = nft_obj_data(obj);
+ const struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+/* Object destroy hook for packet-based limit objects: hands the embedded
+ * nft_limit_priv to the common destroy helper.
+ */
+static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx,
+				       struct nft_object *obj)
+{
+	const struct nft_limit_priv_pkts *pkts = nft_obj_data(obj);
+
+	nft_limit_destroy(ctx, &pkts->limit);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_pkts_ops = {
.type = &nft_limit_obj_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.init = nft_limit_obj_pkts_init,
+ .destroy = nft_limit_obj_pkts_destroy,
.eval = nft_limit_obj_pkts_eval,
.dump = nft_limit_obj_pkts_dump,
};
@@ -283,7 +366,7 @@ static void nft_limit_obj_bytes_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -294,7 +377,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_init(priv, tb, false);
}
@@ -303,16 +386,25 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit *priv = nft_obj_data(obj);
+ const struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+/* Object destroy hook for byte-based limit objects: the object data is the
+ * nft_limit_priv itself, so pass it straight to the common destroy helper.
+ */
+static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx,
+					struct nft_object *obj)
+{
+	nft_limit_destroy(ctx, nft_obj_data(obj));
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_bytes_ops = {
.type = &nft_limit_obj_type,
- .size = sizeof(struct nft_limit),
+ .size = sizeof(struct nft_limit_priv),
.init = nft_limit_obj_bytes_init,
+ .destroy = nft_limit_obj_bytes_destroy,
.eval = nft_limit_obj_bytes_eval,
.dump = nft_limit_obj_bytes_dump,
};
@@ -372,3 +464,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("limit");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT);
+MODULE_DESCRIPTION("nftables limit expression support");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index fe4831f2258f..0e13c003f0c1 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -128,6 +128,20 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
[NFTA_LOG_FLAGS] = { .type = NLA_U32 },
};
+/* Request the module that backs the given logger type.
+ *
+ * Returns whatever nft_request_module() returns for a known type, or
+ * -ENOENT when @t has no module associated with it.
+ */
+static int nft_log_modprobe(struct net *net, enum nf_log_type t)
+{
+	const char *modname = NULL;
+
+	/* No default label, so every enumerator is listed explicitly. */
+	switch (t) {
+	case NF_LOG_TYPE_LOG:
+		modname = "nf_log_syslog";
+		break;
+	case NF_LOG_TYPE_ULOG:
+		modname = "nfnetlink_log";
+		break;
+	case NF_LOG_TYPE_MAX:
+		break;
+	}
+
+	if (!modname)
+		return -ENOENT;
+
+	return nft_request_module(net, "%s", modname);
+}
+
static int nft_log_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -152,7 +166,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
if (priv->prefix == NULL)
return -ENOMEM;
- nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+ nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);
} else {
priv->prefix = (char *)nft_log_null_prefix;
}
@@ -197,8 +211,12 @@ static int nft_log_init(const struct nft_ctx *ctx,
return 0;
err = nf_logger_find_get(ctx->family, li->type);
- if (err < 0)
+ if (err < 0) {
+ if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN)
+ err = -EAGAIN;
+
goto err1;
+ }
return 0;
@@ -272,6 +290,7 @@ static const struct nft_expr_ops nft_log_ops = {
.init = nft_log_init,
.destroy = nft_log_destroy,
.dump = nft_log_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_log_type __read_mostly = {
@@ -298,3 +317,4 @@ module_exit(nft_log_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("log");
+MODULE_DESCRIPTION("Netfilter nf_tables log module");
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 660bad688e2b..dfae12759c7c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -17,32 +17,70 @@
struct nft_lookup {
struct nft_set *set;
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
bool invert;
struct nft_set_binding binding;
};
+#ifdef CONFIG_RETPOLINE
+/* Set lookup dispatcher, built only when CONFIG_RETPOLINE is enabled.
+ *
+ * Compares set->ops against the ops of each known set backend and calls the
+ * matching lookup function directly rather than through the
+ * set->ops->lookup pointer (presumably to avoid the cost of an indirect
+ * call under retpolines -- confirm against the header that declares this).
+ *
+ * Returns the result of the backend lookup; @ext is passed through to it.
+ */
+bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ if (set->ops == &nft_set_hash_fast_type.ops)
+ return nft_hash_lookup_fast(net, set, key, ext);
+ if (set->ops == &nft_set_hash_type.ops)
+ return nft_hash_lookup(net, set, key, ext);
+
+ if (set->ops == &nft_set_rhash_type.ops)
+ return nft_rhash_lookup(net, set, key, ext);
+
+ if (set->ops == &nft_set_bitmap_type.ops)
+ return nft_bitmap_lookup(net, set, key, ext);
+
+ if (set->ops == &nft_set_pipapo_type.ops)
+ return nft_pipapo_lookup(net, set, key, ext);
+ /* The AVX2 pipapo variant is only compiled in on x86_64 non-UML builds. */
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (set->ops == &nft_set_pipapo_avx2_type.ops)
+ return nft_pipapo_avx2_lookup(net, set, key, ext);
+#endif
+
+ if (set->ops == &nft_set_rbtree_type.ops)
+ return nft_rbtree_lookup(net, set, key, ext);
+
+ /* Unknown backend: warn once, then fall back to the indirect call. */
+ WARN_ON_ONCE(1);
+ return set->ops->lookup(net, set, key, ext);
+}
+EXPORT_SYMBOL_GPL(nft_set_do_lookup);
+#endif
+
void nft_lookup_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
- const struct nft_set_ext *ext;
+ const struct nft_set_ext *ext = NULL;
+ const struct net *net = nft_net(pkt);
bool found;
- found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
- &ext) ^ priv->invert;
+ found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^
+ priv->invert;
if (!found) {
- regs->verdict.code = NFT_BREAK;
- return;
+ ext = nft_set_catchall_lookup(net, set);
+ if (!ext) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
}
- if (set->flags & NFT_SET_MAP)
- nft_data_copy(&regs->data[priv->dreg],
- nft_set_ext_data(ext), set->dlen);
+ if (ext) {
+ if (set->flags & NFT_SET_MAP)
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), set->dlen);
+ nft_set_elem_update_expr(ext, regs, pkt);
+ }
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
@@ -73,8 +111,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
- priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]);
- err = nft_validate_register_load(priv->sreg, set->klen);
+ err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg,
+ set->klen);
if (err < 0)
return err;
@@ -97,9 +135,9 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_MAP))
return -EINVAL;
- priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- set->dtype, set->dlen);
+ err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG],
+ &priv->dreg, NULL, set->dtype,
+ set->dlen);
if (err < 0)
return err;
} else if (set->flags & NFT_SET_MAP)
@@ -215,6 +253,17 @@ static int nft_lookup_validate(const struct nft_ctx *ctx,
return 0;
}
+/* Register-tracking hook for lookup expressions.
+ *
+ * When the bound set is a map, the destination register range written by
+ * the lookup (set->dlen bytes starting at dreg) is cancelled in @track.
+ * Always returns false.
+ */
+static bool nft_lookup_reduce(struct nft_regs_track *track,
+			      const struct nft_expr *expr)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+	const struct nft_set *set = priv->set;
+
+	if (!(set->flags & NFT_SET_MAP))
+		return false;
+
+	nft_reg_track_cancel(track, priv->dreg, set->dlen);
+	return false;
+}
+
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -225,6 +274,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
.validate = nft_lookup_validate,
+ .reduce = nft_lookup_reduce,
};
struct nft_expr_type nft_lookup_type __read_mostly = {
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index bc9fd98c5d6d..2a0adc497bbb 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -15,8 +15,8 @@
struct nft_masq {
u32 flags;
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
};
static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
@@ -54,19 +54,15 @@ static int nft_masq_init(const struct nft_ctx *ctx,
}
if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
@@ -133,6 +129,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
.destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
@@ -179,6 +176,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = {
.destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
@@ -234,6 +232,7 @@ static const struct nft_expr_ops nft_masq_inet_ops = {
.destroy = nft_masq_inet_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_inet_type __read_mostly = {
@@ -305,3 +304,4 @@ module_exit(nft_masq_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_EXPR("masq");
+MODULE_DESCRIPTION("Netfilter nftables masquerade expression support");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 951b6e87ed5d..55d2d49c3425 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -14,6 +14,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/random.h>
#include <linux/smp.h>
#include <linux/static_key.h>
#include <net/dst.h>
@@ -32,8 +33,6 @@
#define NFT_META_SECS_PER_DAY 86400
#define NFT_META_DAYS_PER_WEEK 7
-static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-
static u8 nft_meta_weekday(void)
{
time64_t secs = ktime_get_real_seconds();
@@ -147,11 +146,11 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key,
switch (key) {
case NFT_META_SKUID:
- *dest = from_kuid_munged(&init_user_ns,
+ *dest = from_kuid_munged(sock_net(sk)->user_ns,
sock->file->f_cred->fsuid);
break;
case NFT_META_SKGID:
- *dest = from_kgid_munged(&init_user_ns,
+ *dest = from_kgid_munged(sock_net(sk)->user_ns,
sock->file->f_cred->fsgid);
break;
default:
@@ -244,7 +243,11 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
case NFT_META_OIF:
nft_meta_store_ifindex(dest, nft_out(pkt));
break;
- case NFT_META_IIFTYPE:
+ case NFT_META_IFTYPE:
+ if (!nft_meta_store_iftype(dest, pkt->skb->dev))
+ return false;
+ break;
+ case __NFT_META_IIFTYPE:
if (!nft_meta_store_iftype(dest, nft_in(pkt)))
return false;
break;
@@ -253,7 +256,7 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
return false;
break;
case NFT_META_IIFGROUP:
- if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+ if (!nft_meta_store_ifgroup(dest, nft_in(pkt)))
return false;
break;
case NFT_META_OIFGROUP:
@@ -267,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
return true;
}
-static noinline u32 nft_prandom_u32(void)
-{
- struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
-
- return prandom_u32_state(state);
-}
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static noinline bool
nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
@@ -329,7 +325,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store8(dest, nft_pf(pkt));
break;
case NFT_META_L4PROTO:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
goto err;
nft_reg_store8(dest, pkt->tprot);
break;
@@ -385,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
#endif
case NFT_META_PRANDOM:
- *dest = nft_prandom_u32();
+ *dest = get_random_u32();
break;
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
@@ -514,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = IFNAMSIZ;
break;
case NFT_META_PRANDOM:
- prandom_init_once(&nft_prandom_state);
len = sizeof(u32);
break;
#ifdef CONFIG_XFRM
@@ -535,9 +530,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
EXPORT_SYMBOL_GPL(nft_meta_get_init);
@@ -661,8 +656,8 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
- err = nft_validate_register_load(priv->sreg, len);
+ priv->len = len;
+ err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len);
if (err < 0)
return err;
@@ -724,22 +719,22 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
switch (priv->key) {
case NFT_META_PROTOCOL:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
- sizeof(__u16), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
+ sizeof(__u16), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case NFT_META_L4PROTO:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
- sizeof(__u8), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
break;
case NFT_META_IIF:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
- ingress_ifindex, sizeof(__u32), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_ifindex, sizeof(__u32), reg);
break;
case NFT_META_IIFTYPE:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
- ingress_iftype, sizeof(__u16), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_iftype, sizeof(__u16), reg);
break;
default:
return -EOPNOTSUPP;
@@ -748,16 +743,60 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
return 0;
}
+/* Register-tracking hook for meta "get" expressions.
+ *
+ * Returns false after recording this expression as the selector for its
+ * destination register when the tracked state does not match (per
+ * nft_reg_track_cmp(), or because the tracked meta expression has a
+ * different key or dreg). Returns true when the tracked selector matches
+ * and no bitwise operation is recorded for the register; otherwise the
+ * decision is delegated to nft_expr_reduce_bitwise().
+ *
+ * NOTE(review): a true return presumably tells the caller that this load
+ * is redundant -- confirm against the ->reduce contract.
+ */
+bool nft_meta_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct nft_meta *meta;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ /* Compare against the meta expression currently tracked for dreg. */
+ meta = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != meta->key ||
+ priv->dreg != meta->dreg) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_reduce);
+
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
.eval = nft_meta_get_eval,
.init = nft_meta_get_init,
.dump = nft_meta_get_dump,
+ .reduce = nft_meta_get_reduce,
.validate = nft_meta_get_validate,
.offload = nft_meta_get_offload,
};
+/* Register-tracking hook for meta "set" expressions.
+ *
+ * Walks all 32-bit registers and cancels tracking for each one whose
+ * recorded selector is a meta "get" expression. Always returns false.
+ */
+static bool nft_meta_set_reduce(struct nft_regs_track *track,
+				const struct nft_expr *expr)
+{
+	int i;
+
+	for (i = 0; i < NFT_REG32_NUM; i++) {
+		const struct nft_expr *sel = track->regs[i].selector;
+
+		if (sel && sel->ops == &nft_meta_get_ops)
+			__nft_reg_track_cancel(track, i);
+	}
+
+	return false;
+}
+
static const struct nft_expr_ops nft_meta_set_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -765,6 +804,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .reduce = nft_meta_set_reduce,
.validate = nft_meta_set_validate,
};
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 8b44a4de5329..e5fd6995e4bf 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -21,15 +21,85 @@
#include <net/ip.h>
struct nft_nat {
- enum nft_registers sreg_addr_min:8;
- enum nft_registers sreg_addr_max:8;
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_addr_min;
+ u8 sreg_addr_max;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
enum nf_nat_manip_type type:8;
u8 family;
u16 flags;
};
+/* Fill the min/max address of @range from the source registers named in
+ * @priv. For AF_INET a single 32-bit register value is used per bound; for
+ * AF_INET6 sizeof(ip6) bytes are copied starting at each register. Any
+ * other family leaves @range untouched.
+ */
+static void nft_nat_setup_addr(struct nf_nat_range2 *range,
+			       const struct nft_regs *regs,
+			       const struct nft_nat *priv)
+{
+	const u32 *lo = &regs->data[priv->sreg_addr_min];
+	const u32 *hi = &regs->data[priv->sreg_addr_max];
+
+	if (priv->family == AF_INET) {
+		range->min_addr.ip = (__force __be32)*lo;
+		range->max_addr.ip = (__force __be32)*hi;
+	} else if (priv->family == AF_INET6) {
+		memcpy(range->min_addr.ip6, lo, sizeof(range->min_addr.ip6));
+		memcpy(range->max_addr.ip6, hi, sizeof(range->max_addr.ip6));
+	}
+}
+
+/* Fill the min/max protocol value of @range with the 16-bit values loaded
+ * from the source registers named in @priv.
+ */
+static void nft_nat_setup_proto(struct nf_nat_range2 *range,
+				const struct nft_regs *regs,
+				const struct nft_nat *priv)
+{
+	const u32 *lo = &regs->data[priv->sreg_proto_min];
+	const u32 *hi = &regs->data[priv->sreg_proto_max];
+
+	range->min_proto.all = (__force __be16)nft_reg_load16(lo);
+	range->max_proto.all = (__force __be16)nft_reg_load16(hi);
+}
+
+/* Rewrite @range into a single address derived from the packet address.
+ *
+ * Takes the packet's source (SNAT) or destination (DNAT) address, keeps the
+ * bits in which range->min_addr and range->max_addr differ, and replaces
+ * the bits in which they agree with the corresponding bits of
+ * range->min_addr. The result is stored as both min_addr and max_addr.
+ */
+static void nft_nat_setup_netmap(struct nf_nat_range2 *range,
+ const struct nft_pktinfo *pkt,
+ const struct nft_nat *priv)
+{
+ struct sk_buff *skb = pkt->skb;
+ union nf_inet_addr new_addr;
+ __be32 netmask;
+ int i, len = 0;
+
+ /* NOTE(review): priv->type is declared as enum nf_nat_manip_type, while
+ * the case labels are NFT_NAT_* constants; this appears to rely on both
+ * enums sharing numeric values -- confirm. If neither case matches, len
+ * stays 0 and new_addr is copied into @range without being written.
+ */
+ switch (priv->type) {
+ case NFT_NAT_SNAT:
+ if (nft_pf(pkt) == NFPROTO_IPV4) {
+ new_addr.ip = ip_hdr(skb)->saddr;
+ len = sizeof(struct in_addr);
+ } else {
+ new_addr.in6 = ipv6_hdr(skb)->saddr;
+ len = sizeof(struct in6_addr);
+ }
+ break;
+ case NFT_NAT_DNAT:
+ if (nft_pf(pkt) == NFPROTO_IPV4) {
+ new_addr.ip = ip_hdr(skb)->daddr;
+ len = sizeof(struct in_addr);
+ } else {
+ new_addr.in6 = ipv6_hdr(skb)->daddr;
+ len = sizeof(struct in6_addr);
+ }
+ break;
+ }
+
+ /* Process the address one 32-bit word at a time: netmask has a 1 for
+ * every bit where min and max agree.
+ */
+ for (i = 0; i < len / sizeof(__be32); i++) {
+ netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]);
+ new_addr.ip6[i] &= ~netmask;
+ new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask;
+ }
+
+ range->min_addr = new_addr;
+ range->max_addr = new_addr;
+}
+
static void nft_nat_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -40,33 +110,17 @@ static void nft_nat_eval(const struct nft_expr *expr,
struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
- if (priv->sreg_addr_min) {
- if (priv->family == AF_INET) {
- range.min_addr.ip = (__force __be32)
- regs->data[priv->sreg_addr_min];
- range.max_addr.ip = (__force __be32)
- regs->data[priv->sreg_addr_max];
- } else {
- memcpy(range.min_addr.ip6,
- &regs->data[priv->sreg_addr_min],
- sizeof(range.min_addr.ip6));
- memcpy(range.max_addr.ip6,
- &regs->data[priv->sreg_addr_max],
- sizeof(range.max_addr.ip6));
- }
- range.flags |= NF_NAT_RANGE_MAP_IPS;
+ if (priv->sreg_addr_min) {
+ nft_nat_setup_addr(&range, regs, priv);
+ if (priv->flags & NF_NAT_RANGE_NETMAP)
+ nft_nat_setup_netmap(&range, pkt, priv);
}
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
+ if (priv->sreg_proto_min)
+ nft_nat_setup_proto(&range, regs, priv);
- range.flags |= priv->flags;
+ range.flags = priv->flags;
regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type);
}
@@ -129,7 +183,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->type = NF_NAT_MANIP_DST;
break;
default:
- return -EINVAL;
+ return -EOPNOTSUPP;
}
if (tb[NFTA_NAT_FAMILY] == NULL)
@@ -147,56 +201,55 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
alen = sizeof_field(struct nf_nat_range, min_addr.ip6);
break;
default:
- return -EAFNOSUPPORT;
+ if (tb[NFTA_NAT_REG_ADDR_MIN])
+ return -EAFNOSUPPORT;
+ break;
}
priv->family = family;
if (tb[NFTA_NAT_REG_ADDR_MIN]) {
- priv->sreg_addr_min =
- nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]);
- err = nft_validate_register_load(priv->sreg_addr_min, alen);
+ err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN],
+ &priv->sreg_addr_min, alen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_ADDR_MAX]) {
- priv->sreg_addr_max =
- nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]);
-
- err = nft_validate_register_load(priv->sreg_addr_max,
- alen);
+ err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX],
+ &priv->sreg_addr_max,
+ alen);
if (err < 0)
return err;
} else {
priv->sreg_addr_max = priv->sreg_addr_min;
}
+
+ priv->flags |= NF_NAT_RANGE_MAP_IPS;
}
plen = sizeof_field(struct nf_nat_range, min_addr.all);
if (tb[NFTA_NAT_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
priv->sreg_proto_max = priv->sreg_proto_min;
}
+
+ priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
if (tb[NFTA_NAT_FLAGS]) {
- priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
+ priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
+ return -EOPNOTSUPP;
}
return nf_ct_netns_get(ctx->net, family);
@@ -264,6 +317,7 @@ static const struct nft_expr_ops nft_nat_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_nat_type __read_mostly = {
@@ -281,7 +335,8 @@ static void nft_nat_inet_eval(const struct nft_expr *expr,
{
const struct nft_nat *priv = nft_expr_priv(expr);
- if (priv->family == nft_pf(pkt))
+ if (priv->family == nft_pf(pkt) ||
+ priv->family == NFPROTO_INET)
nft_nat_eval(expr, regs, pkt);
}
@@ -293,6 +348,7 @@ static const struct nft_expr_ops nft_nat_inet_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_inet_nat_type __read_mostly = {
@@ -344,3 +400,4 @@ module_exit(nft_nat_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
MODULE_ALIAS_NFT_EXPR("nat");
+MODULE_DESCRIPTION("Network Address Translation support");
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 48edb9d5f012..45d3dc9e96f2 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -9,16 +9,15 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/random.h>
#include <linux/static_key.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
-
struct nft_ng_inc {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
- atomic_t counter;
+ atomic_t *counter;
u32 offset;
};
@@ -27,9 +26,9 @@ static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
u32 nval, oval;
do {
- oval = atomic_read(&priv->counter);
+ oval = atomic_read(priv->counter);
nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
- } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+ } while (atomic_cmpxchg(priv->counter, oval, nval) != oval);
return nval + priv->offset;
}
@@ -55,6 +54,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_ng_inc *priv = nft_expr_priv(expr);
+ int err;
if (tb[NFTA_NG_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
@@ -66,11 +66,32 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
- atomic_set(&priv->counter, priv->modulus - 1);
+ priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL);
+ if (!priv->counter)
+ return -ENOMEM;
+
+ atomic_set(priv->counter, priv->modulus - 1);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(priv->counter);
+
+ return err;
+}
+
+static bool nft_ng_inc_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
}
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
@@ -99,18 +120,23 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static void nft_ng_inc_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ kfree(priv->counter);
+}
+
struct nft_ng_random {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
u32 offset;
};
-static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+static u32 nft_ng_random_gen(const struct nft_ng_random *priv)
{
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-
- return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
- priv->offset;
+ return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset;
}
static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -138,12 +164,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- prandom_init_once(&nft_numgen_prandom_state);
-
- priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
-
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
}
static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -154,13 +176,25 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static bool nft_ng_random_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
+}
+
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
.eval = nft_ng_inc_eval,
.init = nft_ng_inc_init,
+ .destroy = nft_ng_inc_destroy,
.dump = nft_ng_inc_dump,
+ .reduce = nft_ng_inc_reduce,
};
static const struct nft_expr_ops nft_ng_random_ops = {
@@ -169,6 +203,7 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.eval = nft_ng_random_eval,
.init = nft_ng_random_init,
.dump = nft_ng_random_dump,
+ .reduce = nft_ng_random_reduce,
};
static const struct nft_expr_ops *
@@ -217,3 +252,4 @@ module_exit(nft_ng_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
MODULE_ALIAS_NFT_EXPR("numgen");
+MODULE_DESCRIPTION("nftables number generator module");
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index bfd18d2b65a2..5d8d91b3904d 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -9,7 +9,7 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
@@ -91,11 +91,12 @@ static const struct nft_expr_ops nft_objref_ops = {
.activate = nft_objref_activate,
.deactivate = nft_objref_deactivate,
.dump = nft_objref_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_objref_map {
struct nft_set *set;
- enum nft_registers sreg:8;
+ u8 sreg;
struct nft_set_binding binding;
};
@@ -105,15 +106,18 @@ static void nft_objref_map_eval(const struct nft_expr *expr,
{
struct nft_objref_map *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
+ struct net *net = nft_net(pkt);
const struct nft_set_ext *ext;
struct nft_object *obj;
bool found;
- found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
- &ext);
+ found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext);
if (!found) {
- regs->verdict.code = NFT_BREAK;
- return;
+ ext = nft_set_catchall_lookup(net, set);
+ if (!ext) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
}
obj = *nft_set_ext_obj(ext);
obj->ops->eval(obj, regs, pkt);
@@ -137,8 +141,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_OBJECT))
return -EINVAL;
- priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
- err = nft_validate_register_load(priv->sreg, set->klen);
+ err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
+ set->klen);
if (err < 0)
return err;
@@ -201,6 +205,7 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -252,3 +257,4 @@ module_exit(nft_objref_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("objref");
+MODULE_DESCRIPTION("nftables stateful object reference module");
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index b42247aa48a9..adacf95b6e2b 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -6,7 +6,7 @@
#include <linux/netfilter/nfnetlink_osf.h>
struct nft_osf {
- enum nft_registers dreg:8;
+ u8 dreg;
u8 ttl;
u32 flags;
};
@@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct nf_osf_data data;
struct tcphdr _tcph;
+ if (pkt->tprot != IPPROTO_TCP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
tcp = skb_header_pointer(skb, ip_hdrlen(skb),
sizeof(struct tcphdr), &_tcph);
if (!tcp) {
@@ -46,7 +51,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s",
data.genre, data.version);
else
- strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
+ strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
}
@@ -78,9 +83,9 @@ static int nft_osf_init(const struct nft_ctx *ctx,
priv->flags = flags;
}
- priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN);
+ err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE,
+ NFT_OSF_MAXGENRELEN);
if (err < 0)
return err;
@@ -94,7 +99,7 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags)))
+ if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags)))
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
@@ -110,9 +115,45 @@ static int nft_osf_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data)
{
- return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_FORWARD));
+ unsigned int hooks;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_FORWARD);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static bool nft_osf_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ struct nft_osf *osf;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ osf = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->flags != osf->flags ||
+ priv->ttl != osf->ttl) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
}
static struct nft_expr_type nft_osf_type;
@@ -123,6 +164,7 @@ static const struct nft_expr_ops nft_osf_op = {
.dump = nft_osf_dump,
.type = &nft_osf_type,
.validate = nft_osf_validate,
+ .reduce = nft_osf_reduce,
};
static struct nft_expr_type nft_osf_type __read_mostly = {
@@ -149,3 +191,4 @@ module_exit(nft_osf_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
MODULE_ALIAS_NFT_EXPR("osf");
+MODULE_DESCRIPTION("nftables passive OS fingerprint support");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index a7de3a58f553..4edd899aeb9b 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -22,6 +22,7 @@
#include <linux/icmpv6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <net/sctp/checksum.h>
static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
struct vlan_ethhdr *veth)
@@ -78,6 +79,45 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
}
+static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
+{
+ unsigned int thoff = nft_thoff(pkt);
+
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ return -1;
+
+ switch (pkt->tprot) {
+ case IPPROTO_UDP:
+ pkt->inneroff = thoff + sizeof(struct udphdr);
+ break;
+ case IPPROTO_TCP: {
+ struct tcphdr *th, _tcph;
+
+ th = skb_header_pointer(pkt->skb, thoff, sizeof(_tcph), &_tcph);
+ if (!th)
+ return -1;
+
+ pkt->inneroff = thoff + __tcp_hdrlen(th);
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->flags |= NFT_PKTINFO_INNER;
+
+ return 0;
+}
+
+static int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
+{
+ if (!(pkt->flags & NFT_PKTINFO_INNER) &&
+ __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0)
+ return -1;
+
+ return pkt->inneroff;
+}
+
void nft_payload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -87,7 +127,9 @@ void nft_payload_eval(const struct nft_expr *expr,
u32 *dest = &regs->data[priv->dreg];
int offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
if (!skb_mac_header_was_set(skb))
@@ -105,12 +147,18 @@ void nft_payload_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ goto err;
+ offset = nft_thoff(pkt);
+ break;
+ case NFT_PAYLOAD_INNER_HEADER:
+ offset = nft_payload_inner_offset(pkt);
+ if (offset < 0)
goto err;
- offset = pkt->xt.thoff;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
offset += priv->offset;
@@ -125,10 +173,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
@@ -141,10 +189,10 @@ static int nft_payload_init(const struct nft_ctx *ctx,
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
- priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -162,6 +210,59 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct nft_payload *payload;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ payload = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->base != payload->base ||
+ priv->offset != payload->offset ||
+ priv->len != payload->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static bool nft_payload_offload_mask(struct nft_offload_reg *reg,
+ u32 priv_len, u32 field_len)
+{
+ unsigned int remainder, delta, k;
+ struct nft_data mask = {};
+ __be32 remainder_mask;
+
+ if (priv_len == field_len) {
+ memset(&reg->mask, 0xff, priv_len);
+ return true;
+ } else if (priv_len > field_len) {
+ return false;
+ }
+
+ memset(&mask, 0xff, field_len);
+ remainder = priv_len % sizeof(u32);
+ if (remainder) {
+ k = priv_len / sizeof(u32);
+ delta = field_len - priv_len;
+ remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1));
+ mask.data[k] = (__force u32)remainder_mask;
+ }
+
+ memcpy(&reg->mask, &mask, field_len);
+
+ return true;
+}
+
static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
struct nft_flow_rule *flow,
const struct nft_payload *priv)
@@ -170,21 +271,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ethhdr, h_source):
- if (priv->len != ETH_ALEN)
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
src, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_dest):
- if (priv->len != ETH_ALEN)
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
dst, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_proto):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
@@ -192,14 +293,15 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case offsetof(struct vlan_ethhdr, h_vlan_TCI):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
- vlan_tci, sizeof(__be16), reg);
+ NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tci, sizeof(__be16), reg,
+ NFT_OFFLOAD_F_NETWORK2HOST);
break;
case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
@@ -207,19 +309,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
- vlan_tci, sizeof(__be16), reg);
+ NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan,
+ vlan_tci, sizeof(__be16), reg,
+ NFT_OFFLOAD_F_NETWORK2HOST);
break;
case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) +
sizeof(struct vlan_hdr):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan,
vlan_tpid, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
default:
return -EOPNOTSUPP;
@@ -236,21 +340,25 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct iphdr, saddr):
- if (priv->len != sizeof(struct in_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
break;
case offsetof(struct iphdr, daddr):
- if (priv->len != sizeof(struct in_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
break;
case offsetof(struct iphdr, protocol):
- if (priv->len != sizeof(__u8))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
@@ -272,21 +380,25 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ipv6hdr, saddr):
- if (priv->len != sizeof(struct in6_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
break;
case offsetof(struct ipv6hdr, daddr):
- if (priv->len != sizeof(struct in6_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
break;
case offsetof(struct ipv6hdr, nexthdr):
- if (priv->len != sizeof(__u8))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
@@ -328,14 +440,14 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct tcphdr, source):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct tcphdr, dest):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
@@ -356,14 +468,14 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct udphdr, source):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct udphdr, dest):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
@@ -426,6 +538,7 @@ static const struct nft_expr_ops nft_payload_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
.offload = nft_payload_offload,
};
@@ -435,6 +548,7 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
.offload = nft_payload_offload,
};
@@ -460,14 +574,17 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
struct sk_buff *skb,
unsigned int *l4csum_offset)
{
+ if (pkt->fragoff)
+ return -1;
+
switch (pkt->tprot) {
case IPPROTO_TCP:
*l4csum_offset = offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
- if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+ if (!nft_payload_udp_checksum(skb, nft_thoff(pkt)))
return -1;
- /* Fall through. */
+ fallthrough;
case IPPROTO_UDPLITE:
*l4csum_offset = offsetof(struct udphdr, check);
break;
@@ -478,7 +595,20 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
return -1;
}
- *l4csum_offset += pkt->xt.thoff;
+ *l4csum_offset += nft_thoff(pkt);
+ return 0;
+}
+
+static int nft_payload_csum_sctp(struct sk_buff *skb, int offset)
+{
+ struct sctphdr *sh;
+
+ if (skb_ensure_writable(skb, offset + sizeof(*sh)))
+ return -1;
+
+ sh = (struct sctphdr *)(skb->data + offset);
+ sh->checksum = sctp_compute_cksum(skb, offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
}
@@ -555,19 +685,26 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ goto err;
+ offset = nft_thoff(pkt);
+ break;
+ case NFT_PAYLOAD_INNER_HEADER:
+ offset = nft_payload_inner_offset(pkt);
+ if (offset < 0)
goto err;
- offset = pkt->xt.thoff;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
csum_offset = offset + priv->csum_offset;
offset += priv->offset;
if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
- (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
+ ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER &&
+ priv->base != NFT_PAYLOAD_INNER_HEADER) ||
skb->ip_summed != CHECKSUM_PARTIAL)) {
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
@@ -585,6 +722,14 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
skb_store_bits(skb, offset, src, priv->len) < 0)
goto err;
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
+ pkt->tprot == IPPROTO_SCTP &&
+ skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (pkt->fragoff == 0 &&
+ nft_payload_csum_sctp(skb, nft_thoff(pkt)))
+ goto err;
+ }
+
return;
err:
regs->verdict.code = NFT_BREAK;
@@ -595,18 +740,23 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_payload_set *priv = nft_expr_priv(expr);
+ u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
+ int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
- priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]);
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
- priv->csum_type =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
- if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
- priv->csum_offset =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
+ if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX,
+ &csum_offset);
+ if (err < 0)
+ return err;
+
+ priv->csum_offset = csum_offset;
+ }
if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
u32 flags;
@@ -617,15 +767,24 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
priv->csum_flags = flags;
}
- switch (priv->csum_type) {
+ switch (csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
case NFT_PAYLOAD_CSUM_INET:
break;
+ case NFT_PAYLOAD_CSUM_SCTP:
+ if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER)
+ return -EINVAL;
+
+ if (priv->csum_offset != offsetof(struct sctphdr, checksum))
+ return -EINVAL;
+ break;
default:
return -EOPNOTSUPP;
}
+ priv->csum_type = csum_type;
- return nft_validate_register_load(priv->sreg, priv->len);
+ return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg,
+ priv->len);
}
static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -647,12 +806,32 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_payload_ops &&
+ track->regs[i].selector->ops != &nft_payload_fast_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_payload_set_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)),
.eval = nft_payload_set_eval,
.init = nft_payload_set_init,
.dump = nft_payload_set_dump,
+ .reduce = nft_payload_set_reduce,
};
static const struct nft_expr_ops *
@@ -661,6 +840,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
{
enum nft_payload_bases base;
unsigned int offset, len;
+ int err;
if (tb[NFTA_PAYLOAD_BASE] == NULL ||
tb[NFTA_PAYLOAD_OFFSET] == NULL ||
@@ -672,6 +852,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
case NFT_PAYLOAD_LL_HEADER:
case NFT_PAYLOAD_NETWORK_HEADER:
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ case NFT_PAYLOAD_INNER_HEADER:
break;
default:
return ERR_PTR(-EOPNOTSUPP);
@@ -686,11 +867,16 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
- len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) &&
- base != NFT_PAYLOAD_LL_HEADER)
+ base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER)
return &nft_payload_fast_ops;
else
return &nft_payload_ops;
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 5ece0a6aa8c3..da29e92c03e2 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -19,10 +19,10 @@
static u32 jhash_initval __read_mostly;
struct nft_queue {
- enum nft_registers sreg_qnum:8;
- u16 queuenum;
- u16 queues_total;
- u16 flags;
+ u8 sreg_qnum;
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
};
static void nft_queue_eval(const struct nft_expr *expr,
@@ -68,6 +68,31 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static int nft_queue_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ break;
+ case NFPROTO_NETDEV: /* lacks okfn */
+ fallthrough;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, supported_hooks);
+}
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
@@ -111,8 +136,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
struct nft_queue *priv = nft_expr_priv(expr);
int err;
- priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
- err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+ err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM],
+ &priv->sreg_qnum, sizeof(u32));
if (err < 0)
return err;
@@ -164,6 +189,8 @@ static const struct nft_expr_ops nft_queue_ops = {
.eval = nft_queue_eval,
.init = nft_queue_init,
.dump = nft_queue_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_queue_sreg_ops = {
@@ -172,6 +199,8 @@ static const struct nft_expr_ops nft_queue_sreg_ops = {
.eval = nft_queue_sreg_eval,
.init = nft_queue_sreg_init,
.dump = nft_queue_sreg_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -216,3 +245,4 @@ module_exit(nft_queue_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Leblond <eric@regit.org>");
MODULE_ALIAS_NFT_EXPR("queue");
+MODULE_DESCRIPTION("Netfilter nftables queue module");
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 4413690591f2..e6b0df68feea 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -15,13 +15,13 @@
struct nft_quota {
atomic64_t quota;
unsigned long flags;
- atomic64_t consumed;
+ atomic64_t *consumed;
};
static inline bool nft_overquota(struct nft_quota *priv,
const struct sk_buff *skb)
{
- return atomic64_add_return(skb->len, &priv->consumed) >=
+ return atomic64_add_return(skb->len, priv->consumed) >=
atomic64_read(&priv->quota);
}
@@ -60,7 +60,7 @@ static void nft_quota_obj_eval(struct nft_object *obj,
if (overquota &&
!test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0,
- NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
+ NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC);
}
static int nft_quota_do_init(const struct nlattr * const tb[],
@@ -90,13 +90,23 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
+ priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
+ if (!priv->consumed)
+ return -ENOMEM;
+
atomic64_set(&priv->quota, quota);
priv->flags = flags;
- atomic64_set(&priv->consumed, consumed);
+ atomic64_set(priv->consumed, consumed);
return 0;
}
+/*
+ * Release the heap-allocated consumed counter that hangs off @priv.
+ * @ctx is accepted for symmetry with the destroy hooks but is not used.
+ */
+static void nft_quota_do_destroy(const struct nft_ctx *ctx,
+ struct nft_quota *priv)
+{
+ kfree(priv->consumed);
+}
+
static int nft_quota_obj_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
@@ -128,7 +138,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
* that we see, don't go over the quota boundary in what we send to
* userspace.
*/
- consumed = atomic64_read(&priv->consumed);
+ consumed = atomic64_read(priv->consumed);
quota = atomic64_read(&priv->quota);
if (consumed >= quota) {
consumed_cap = quota;
@@ -145,7 +155,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
goto nla_put_failure;
if (reset) {
- atomic64_sub(consumed, &priv->consumed);
+ atomic64_sub(consumed, priv->consumed);
clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
}
return 0;
@@ -162,11 +172,20 @@ static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
return nft_quota_do_dump(skb, priv, reset);
}
+/*
+ * Destroy hook for quota objects (installed as nft_quota_obj_ops.destroy):
+ * fetch the quota state stored in @obj and release its consumed counter.
+ */
+static void nft_quota_obj_destroy(const struct nft_ctx *ctx,
+				  struct nft_object *obj)
+{
+	struct nft_quota *priv = nft_obj_data(obj);
+
+	/* Plain call: a void function must not "return" an expression. */
+	nft_quota_do_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_quota_obj_type;
static const struct nft_object_ops nft_quota_obj_ops = {
.type = &nft_quota_obj_type,
.size = sizeof(struct nft_quota),
.init = nft_quota_obj_init,
+ .destroy = nft_quota_obj_destroy,
.eval = nft_quota_obj_eval,
.dump = nft_quota_obj_dump,
.update = nft_quota_obj_update,
@@ -205,13 +224,37 @@ static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
return nft_quota_do_dump(skb, priv, false);
}
+/*
+ * Destroy hook for quota expressions (installed as nft_quota_ops.destroy):
+ * fetch the quota state stored in @expr and release its consumed counter.
+ */
+static void nft_quota_destroy(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr)
+{
+	struct nft_quota *priv = nft_expr_priv(expr);
+
+	/* Plain call: a void function must not "return" an expression. */
+	nft_quota_do_destroy(ctx, priv);
+}
+
+/*
+ * Clone hook for quota expressions (installed as nft_quota_ops.clone):
+ * make @dst an independent copy of @src.
+ *
+ * The quota limit and the flags are copied explicitly so that @dst never
+ * runs with an unset limit. The consumed counter lives in its own
+ * allocation, so @dst gets a fresh one seeded with the value from @src
+ * instead of sharing the pointer.
+ *
+ * NOTE(review): this assumes the caller does not pre-copy the private area
+ * when a ->clone hook exists; the explicit copies are harmless either way.
+ *
+ * Returns 0 on success, -ENOMEM if the counter cannot be allocated.
+ */
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	struct nft_quota *priv_src = nft_expr_priv(src);
+	struct nft_quota *priv_dst = nft_expr_priv(dst);
+
+	atomic64_set(&priv_dst->quota, atomic64_read(&priv_src->quota));
+	priv_dst->flags = priv_src->flags;
+
+	priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
+	if (!priv_dst->consumed)
+		return -ENOMEM;
+
+	atomic64_set(priv_dst->consumed, atomic64_read(priv_src->consumed));
+
+	return 0;
+}
+
static struct nft_expr_type nft_quota_type;
static const struct nft_expr_ops nft_quota_ops = {
.type = &nft_quota_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
.eval = nft_quota_eval,
.init = nft_quota_init,
+ .destroy = nft_quota_destroy,
+ .clone = nft_quota_clone,
.dump = nft_quota_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_quota_type __read_mostly = {
@@ -254,3 +297,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("quota");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
+MODULE_DESCRIPTION("Netfilter nftables quota module");
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 89efcc5a533d..832f0d725a9e 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -15,7 +15,7 @@
struct nft_range_expr {
struct nft_data data_from;
struct nft_data data_to;
- enum nft_registers sreg:8;
+ u8 sreg;
u8 len;
enum nft_range_ops op:8;
};
@@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
const struct nlattr * const tb[])
{
struct nft_range_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc_from, desc_to;
+ struct nft_data_desc desc_from = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_from),
+ };
+ struct nft_data_desc desc_to = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_to),
+ };
int err;
u32 op;
@@ -61,33 +68,23 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
!tb[NFTA_RANGE_TO_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
- &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+ err = nft_data_init(NULL, &priv->data_from, &desc_from,
+ tb[NFTA_RANGE_FROM_DATA]);
if (err < 0)
return err;
- if (desc_from.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err1;
- }
-
- err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
- &desc_to, tb[NFTA_RANGE_TO_DATA]);
+ err = nft_data_init(NULL, &priv->data_to, &desc_to,
+ tb[NFTA_RANGE_TO_DATA]);
if (err < 0)
goto err1;
- if (desc_to.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err2;
- }
-
if (desc_from.len != desc_to.len) {
err = -EINVAL;
goto err2;
}
- priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
- err = nft_validate_register_load(priv->sreg, desc_from.len);
+ err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg,
+ desc_from.len);
if (err < 0)
goto err2;
@@ -140,6 +137,7 @@ static const struct nft_expr_ops nft_range_ops = {
.eval = nft_range_eval,
.init = nft_range_init,
.dump = nft_range_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_range_type __read_mostly = {
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 5b779171565c..5086adfe731c 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -14,8 +14,8 @@
#include <net/netfilter/nf_tables.h>
struct nft_redir {
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
u16 flags;
};
@@ -50,19 +50,15 @@ static int nft_redir_init(const struct nft_ctx *ctx,
plen = sizeof_field(struct nf_nat_range, min_addr.all);
if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_REDIR_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
@@ -138,6 +134,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = {
.destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
@@ -187,6 +184,7 @@ static const struct nft_expr_ops nft_redir_ipv6_ops = {
.destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
@@ -229,6 +227,7 @@ static const struct nft_expr_ops nft_redir_inet_ops = {
.destroy = nft_redir_inet_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_inet_type __read_mostly = {
@@ -292,3 +291,4 @@ module_exit(nft_redir_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_EXPR("redir");
+MODULE_DESCRIPTION("Netfilter nftables redirect support");
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 00f865fb80ca..927ff8459bd9 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -30,7 +30,8 @@ int nft_reject_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT));
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING));
}
EXPORT_SYMBOL_GPL(nft_reject_validate);
@@ -39,6 +40,7 @@ int nft_reject_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
+ int icmp_code;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
@@ -46,9 +48,17 @@ int nft_reject_init(const struct nft_ctx *ctx,
priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
return -EINVAL;
- priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+
+ icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
+ icmp_code > NFT_REJECT_ICMPX_MAX)
+ return -EINVAL;
+
+ priv->icmp_code = icmp_code;
+ break;
case NFT_REJECT_TCP_RST:
break;
default:
@@ -68,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
goto nla_put_failure;
break;
@@ -119,3 +130,4 @@ EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Netfilter nftables reject module");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index f41f414b72d1..973fa31a9dd6 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset(nft_net(pkt), nft_sk(pkt),
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
@@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), nft_sk(pkt),
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach6(nft_net(pkt), pkt->skb,
@@ -58,60 +60,16 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
regs->verdict.code = NF_DROP;
}
-static int nft_reject_inet_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_reject_inet_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
- struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
-
- icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
- icmp_code > NFT_REJECT_ICMPX_MAX)
- return -EINVAL;
-
- priv->icmp_code = icmp_code;
- break;
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int nft_reject_inet_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- default:
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_INGRESS));
}
static struct nft_expr_type nft_reject_inet_type;
@@ -119,9 +77,10 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.type = &nft_reject_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
.eval = nft_reject_inet_eval,
- .init = nft_reject_inet_init,
- .dump = nft_reject_inet_dump,
- .validate = nft_reject_validate,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_inet_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
@@ -149,3 +108,4 @@ module_exit(nft_reject_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_AF_EXPR(1, "reject");
+MODULE_DESCRIPTION("Netfilter nftables reject inet support");
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
new file mode 100644
index 000000000000..7865cd8b11bb
--- /dev/null
+++ b/net/netfilter/nft_reject_netdev.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com>
+ * Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+/*
+ * Prepend a link-layer header to the reply @nskb and queue it for
+ * transmission on nskb->dev. The protocol comes from @oldskb, and the
+ * Ethernet source/destination of @oldskb are passed in swapped order so the
+ * reply goes back to the original sender (assuming dev_hard_header() takes
+ * the destination address before the source address -- TODO confirm).
+ *
+ * NOTE(review): the return value of dev_hard_header() is not checked.
+ */
+static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb)
+{
+ dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol),
+ eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest,
+ nskb->len);
+ dev_queue_xmit(nskb);
+}
+
+/*
+ * Ask nf_reject_skb_v4_tcp_reset() for a reply to @oldskb and, if one was
+ * produced, transmit it through nft_reject_queue_xmit(). Nothing is sent
+ * when the helper returns NULL.
+ */
+static void nft_reject_netdev_send_v4_tcp_reset(struct net *net,
+						struct sk_buff *oldskb,
+						const struct net_device *dev,
+						int hook)
+{
+	struct sk_buff *reply;
+
+	reply = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook);
+	if (reply)
+		nft_reject_queue_xmit(reply, oldskb);
+}
+
+/*
+ * Ask nf_reject_skb_v4_unreach() for a reply to @oldskb carrying @code and,
+ * if one was produced, transmit it through nft_reject_queue_xmit(). Nothing
+ * is sent when the helper returns NULL.
+ */
+static void nft_reject_netdev_send_v4_unreach(struct net *net,
+					      struct sk_buff *oldskb,
+					      const struct net_device *dev,
+					      int hook, u8 code)
+{
+	struct sk_buff *reply;
+
+	reply = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code);
+	if (reply)
+		nft_reject_queue_xmit(reply, oldskb);
+}
+
+/*
+ * Ask nf_reject_skb_v6_tcp_reset() for a reply to @oldskb and, if one was
+ * produced, transmit it through nft_reject_queue_xmit(). Nothing is sent
+ * when the helper returns NULL.
+ */
+static void nft_reject_netdev_send_v6_tcp_reset(struct net *net,
+						struct sk_buff *oldskb,
+						const struct net_device *dev,
+						int hook)
+{
+	struct sk_buff *reply;
+
+	reply = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook);
+	if (reply)
+		nft_reject_queue_xmit(reply, oldskb);
+}
+
+
+/*
+ * Ask nf_reject_skb_v6_unreach() for a reply to @oldskb carrying @code and,
+ * if one was produced, transmit it through nft_reject_queue_xmit(). Nothing
+ * is sent when the helper returns NULL.
+ */
+static void nft_reject_netdev_send_v6_unreach(struct net *net,
+					      struct sk_buff *oldskb,
+					      const struct net_device *dev,
+					      int hook, u8 code)
+{
+	struct sk_buff *reply;
+
+	reply = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code);
+	if (reply)
+		nft_reject_queue_xmit(reply, oldskb);
+}
+
+/*
+ * Evaluate the netdev "reject" expression for one packet.
+ *
+ * A reply is generated according to the Ethernet protocol of the packet
+ * (IPv4 or IPv6) and the configured reject type in priv->type. No reply is
+ * generated for frames addressed to a broadcast or multicast destination,
+ * nor for any other Ethernet protocol. In every case the verdict is set to
+ * NF_DROP before returning.
+ */
+static void nft_reject_netdev_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct ethhdr *eth = eth_hdr(pkt->skb);
+ struct nft_reject *priv = nft_expr_priv(expr);
+ const unsigned char *dest = eth->h_dest;
+
+ /* Never answer broadcast/multicast frames, just drop them. */
+ if (is_broadcast_ether_addr(dest) ||
+ is_multicast_ether_addr(dest))
+ goto out;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ /* ICMPX code is translated by nft_reject_icmp_code(). */
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmp_code(priv->icmp_code));
+ break;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ /* ICMPX code is translated by nft_reject_icmpv6_code(). */
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmpv6_code(priv->icmp_code));
+ break;
+ }
+ break;
+ default:
+ /* No explicit way to reject this protocol, drop it. */
+ break;
+ }
+out:
+ /* The rejected packet itself is always dropped. */
+ regs->verdict.code = NF_DROP;
+}
+
+/*
+ * Validate hook: check the chain of @ctx against a hook mask that contains
+ * only NF_NETDEV_INGRESS and return the result of that check. @expr and
+ * @data are not used.
+ */
+static int nft_reject_netdev_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+}
+
+/* Forward declaration: the ops table below points back at the type. */
+static struct nft_expr_type nft_reject_netdev_type;
+/*
+ * Expression operations for netdev "reject". Evaluation and validation are
+ * the netdev-specific functions in this file; init and dump are the
+ * nft_reject_init()/nft_reject_dump() routines.
+ */
+static const struct nft_expr_ops nft_reject_netdev_ops = {
+ .type = &nft_reject_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_netdev_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_netdev_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+/*
+ * Expression type descriptor: registers the name "reject" for the
+ * NFPROTO_NETDEV family, using the nft_reject_policy attribute policy.
+ */
+static struct nft_expr_type nft_reject_netdev_type __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .name = "reject",
+ .ops = &nft_reject_netdev_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+/* Module entry point: register the expression type, returning its result. */
+static int __init nft_reject_netdev_module_init(void)
+{
+ return nft_register_expr(&nft_reject_netdev_type);
+}
+
+/* Module exit point: unregister the expression type registered at init. */
+static void __exit nft_reject_netdev_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_netdev_type);
+}
+
+module_init(nft_reject_netdev_module_init);
+module_exit(nft_reject_netdev_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>");
+MODULE_AUTHOR("Jose M. Guisado <guigom@riseup.net>");
+MODULE_DESCRIPTION("Reject packets from netdev via nftables");
+/* NOTE(review): 5 is presumably the numeric value of NFPROTO_NETDEV -- confirm. */
+MODULE_ALIAS_NFT_AF_EXPR(5, "reject");
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 7cfcb0e2f7ee..71931ec91721 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -15,7 +15,7 @@
struct nft_rt {
enum nft_rt_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
};
static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst)
@@ -141,9 +141,8 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_rt_get_dump(struct sk_buff *skb,
@@ -192,6 +191,7 @@ static const struct nft_expr_ops nft_rt_get_ops = {
.init = nft_rt_get_init,
.dump = nft_rt_get_dump,
.validate = nft_rt_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_rt_type __read_mostly = {
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 87e8d9ba0c9b..96081ac8d2b4 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -21,7 +21,7 @@ struct nft_bitmap_elem {
* the element state in the current and the future generation.
*
* An element can be in three states. The generation cursor is represented using
- * the ^ character, note that this cursor shifts on every succesful transaction.
+ * the ^ character, note that this cursor shifts on every successful transaction.
* If no transaction is going on, we observe all elements are in the following
* state:
*
@@ -39,7 +39,7 @@ struct nft_bitmap_elem {
* 10 = this element is active in the current generation and it becomes inactive
* ^ in the next one. This happens when the element is deactivated but commit
* path has not yet been executed yet, so removal is still pending. On
- * transation abortion, the next generation bit is reset to go back to
+ * transaction abortion, the next generation bit is reset to go back to
* restore its previous state.
*/
struct nft_bitmap {
@@ -73,8 +73,9 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}
-static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
const struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -285,6 +286,8 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
/* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */
if (desc->klen > 2)
return false;
+ else if (desc->expr)
+ return false;
est->size = nft_bitmap_total_size(desc->klen);
est->lookup = NFT_SET_CLASS_O_1;
@@ -293,8 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_bitmap_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_bitmap_type = {
.ops = {
.privsize = nft_bitmap_privsize,
.elemsize = offsetof(struct nft_bitmap_elem, ext),
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index d350a7cd3af0..76de6c8d9865 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -74,8 +74,9 @@ static const struct rhashtable_params nft_rhash_params = {
.automatic_shrinking = true,
};
-static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_rhash *priv = nft_set_priv(set);
const struct nft_rhash_elem *he;
@@ -142,6 +143,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
/* Another cpu may race to insert the element with the same key */
if (prev) {
nft_set_elem_destroy(set, he, true);
+ atomic_dec(&set->nelems);
he = prev;
}
@@ -151,6 +153,7 @@ out:
err2:
nft_set_elem_destroy(set, he, true);
+ atomic_dec(&set->nelems);
err1:
return false;
}
@@ -293,6 +296,22 @@ cont:
rhashtable_walk_exit(&hti);
}
+/*
+ * Walk the expressions attached to the element extension @ext and report
+ * whether any of them asks for garbage collection: returns true as soon as
+ * an expression that provides a ->gc() callback gets a true result from it,
+ * false if none does.
+ */
+static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
+					struct nft_set_ext *ext)
+{
+	struct nft_set_elem_expr *exprs = nft_set_ext_expr(ext);
+	struct nft_expr *cur;
+	u32 iter;
+
+	nft_setelem_expr_foreach(cur, exprs, iter) {
+		/* Expressions without a ->gc() callback never ask for gc. */
+		if (cur->ops->gc) {
+			if (cur->ops->gc(read_pnet(&set->net), cur))
+				return true;
+		}
+	}
+
+	return false;
+}
+
static void nft_rhash_gc(struct work_struct *work)
{
struct nft_set *set;
@@ -314,16 +333,13 @@ static void nft_rhash_gc(struct work_struct *work)
continue;
}
- if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(&he->ext);
+ if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_rhash_expr_needs_gc_run(set, &he->ext))
+ goto needs_gc_run;
- if (expr->ops->gc &&
- expr->ops->gc(read_pnet(&set->net), expr))
- goto gc;
- }
if (!nft_set_elem_expired(&he->ext))
continue;
-gc:
+needs_gc_run:
if (nft_set_elem_mark_busy(&he->ext))
continue;
@@ -337,6 +353,12 @@ gc:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
+ he = nft_set_catchall_gc(set);
+ if (he) {
+ gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+ if (gcb)
+ nft_set_gc_batch_add(gcb, he);
+ }
nft_set_gc_batch_complete(gcb);
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
@@ -393,9 +415,17 @@ static void nft_rhash_destroy(const struct nft_set *set)
(void *)set);
}
+/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
+#define NFT_MAX_BUCKETS (1U << 31)
+
static u32 nft_hash_buckets(u32 size)
{
- return roundup_pow_of_two(size * 4 / 3);
+ u64 val = div_u64((u64)size * 4, 3);
+
+ if (val >= NFT_MAX_BUCKETS)
+ return NFT_MAX_BUCKETS;
+
+ return roundup_pow_of_two(val);
}
static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features,
@@ -419,8 +449,9 @@ struct nft_hash_elem {
struct nft_set_ext ext;
};
-static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -457,9 +488,10 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set,
return ERR_PTR(-ENOENT);
}
-static bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+bool nft_hash_lookup_fast(const struct net *net,
+ const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -596,7 +628,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
return sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
}
static int nft_hash_init(const struct nft_set *set,
@@ -636,8 +668,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
return false;
est->size = sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
- desc->size * sizeof(struct nft_hash_elem);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ (u64)desc->size * sizeof(struct nft_hash_elem);
est->lookup = NFT_SET_CLASS_O_1;
est->space = NFT_SET_CLASS_O_N;
@@ -654,16 +686,15 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
return false;
est->size = sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
- desc->size * sizeof(struct nft_hash_elem);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ (u64)desc->size * sizeof(struct nft_hash_elem);
est->lookup = NFT_SET_CLASS_O_1;
est->space = NFT_SET_CLASS_O_N;
return true;
}
-struct nft_set_type nft_set_rhash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rhash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT | NFT_SET_EVAL,
.ops = {
@@ -686,8 +717,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
@@ -706,8 +736,7 @@ struct nft_set_type nft_set_hash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_fast_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_fast_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 4fc0c924ed5d..4f9299b9dcdd 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -312,7 +312,7 @@
* Jay Ligatti, Josh Kuhn, and Chris Gage.
* Proceedings of the IEEE International Conference on Computer
* Communication Networks (ICCCN), August 2010.
- * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
+ * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
*
* [Rottenstreich 2010]
* Worst-Case TCAM Rule Expansion
@@ -325,149 +325,27 @@
* Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane,
* and Patrick Eugster.
* Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
- * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
+ * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
*/
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/log2.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <uapi/linux/netfilter/nf_tables.h>
-#include <net/ipv6.h> /* For the maximum length of a field */
#include <linux/bitmap.h>
#include <linux/bitops.h>
-/* Count of concatenated fields depends on count of 32-bit nftables registers */
-#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
-
-/* Largest supported field size */
-#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
-#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
-
-/* Number of bits to be grouped together in lookup table buckets, arbitrary */
-#define NFT_PIPAPO_GROUP_BITS 4
-#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS)
-
-/* Fields are padded to 32 bits in input registers */
-#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \
- (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32)))
-#define NFT_PIPAPO_GROUPS_PADDING(x) \
- (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE)
-
-/* Number of buckets, given by 2 ^ n, with n grouped bits */
-#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS)
-
-/* Each n-bit range maps to up to n * 2 rules */
-#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
-
-/* Use the rest of mapping table buckets for rule indices, but it makes no sense
- * to exceed 32 bits
- */
-#if BITS_PER_LONG == 64
-#define NFT_PIPAPO_MAP_TOBITS 32
-#else
-#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
-#endif
-
-/* ...which gives us the highest allowed index for a rule */
-#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
- - (1UL << NFT_PIPAPO_MAP_NBITS))
-
-#define nft_pipapo_for_each_field(field, index, match) \
- for ((field) = (match)->f, (index) = 0; \
- (index) < (match)->field_count; \
- (index)++, (field)++)
-
-/**
- * union nft_pipapo_map_bucket - Bucket of mapping table
- * @to: First rule number (in next field) this rule maps to
- * @n: Number of rules (in next field) this rule maps to
- * @e: If there's no next field, pointer to element this rule maps to
- */
-union nft_pipapo_map_bucket {
- struct {
-#if BITS_PER_LONG == 64
- static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
- u32 to;
-
- static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
- u32 n;
-#else
- unsigned long to:NFT_PIPAPO_MAP_TOBITS;
- unsigned long n:NFT_PIPAPO_MAP_NBITS;
-#endif
- };
- struct nft_pipapo_elem *e;
-};
-
-/**
- * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
- * @groups: Amount of 4-bit groups
- * @rules: Number of inserted rules
- * @bsize: Size of each bucket in lookup table, in longs
- * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets
- * @mt: Mapping table: one bucket per rule
- */
-struct nft_pipapo_field {
- int groups;
- unsigned long rules;
- size_t bsize;
- unsigned long *lt;
- union nft_pipapo_map_bucket *mt;
-};
-
-/**
- * struct nft_pipapo_match - Data used for lookup and matching
- * @field_count Amount of fields in set
- * @scratch: Preallocated per-CPU maps for partial matching results
- * @bsize_max: Maximum lookup table bucket size of all fields, in longs
- * @rcu Matching data is swapped on commits
- * @f: Fields, with lookup and mapping tables
- */
-struct nft_pipapo_match {
- int field_count;
- unsigned long * __percpu *scratch;
- size_t bsize_max;
- struct rcu_head rcu;
- struct nft_pipapo_field f[0];
-};
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
/* Current working bitmap index, toggled between field matches */
static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
/**
- * struct nft_pipapo - Representation of a set
- * @match: Currently in-use matching data
- * @clone: Copy where pending insertions and deletions are kept
- * @groups: Total amount of 4-bit groups for fields in this set
- * @width: Total bytes to be matched for one packet, including padding
- * @dirty: Working copy has pending insertions or deletions
- * @last_gc: Timestamp of last garbage collection run, jiffies
- */
-struct nft_pipapo {
- struct nft_pipapo_match __rcu *match;
- struct nft_pipapo_match *clone;
- int groups;
- int width;
- bool dirty;
- unsigned long last_gc;
-};
-
-struct nft_pipapo_elem;
-
-/**
- * struct nft_pipapo_elem - API-facing representation of single set element
- * @ext: nftables API extensions
- */
-struct nft_pipapo_elem {
- struct nft_set_ext ext;
-};
-
-/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
* @len: Length of bitmap in longs
@@ -484,9 +362,8 @@ struct nft_pipapo_elem {
*
* Return: -1 on no match, bit position on 'match_only', 0 otherwise.
*/
-static int pipapo_refill(unsigned long *map, int len, int rules,
- unsigned long *dst, union nft_pipapo_map_bucket *mt,
- bool match_only)
+int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool match_only)
{
unsigned long bitset;
int k, ret = -1;
@@ -524,15 +401,15 @@ static int pipapo_refill(unsigned long *map, int len, int rules,
* nft_pipapo_lookup() - Lookup function
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @key: nftables API element representation containing key data
* @ext: nftables API extension pointer, filled with matching reference
*
* For more details, see DOC: Theory of Operation.
*
* Return: true on match, false otherwise.
*/
-static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_pipapo *priv = nft_set_priv(set);
unsigned long *res_map, *fill_map;
@@ -559,26 +436,18 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
- unsigned long *lt = f->lt;
- int b, group;
+ int b;
- /* For each 4-bit group: select lookup table bucket depending on
+ /* For each bit group: select lookup table bucket depending on
* packet bytes value, then AND bucket value
*/
- for (group = 0; group < f->groups; group += 2) {
- u8 v;
-
- v = *rp >> 4;
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
-
- v = *rp & 0x0f;
- rp++;
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
- }
+ if (likely(f->bb == 8))
+ pipapo_and_field_buckets_8bit(f, res_map, rp);
+ else
+ pipapo_and_field_buckets_4bit(f, res_map, rp);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -621,7 +490,7 @@ next_match:
map_index = !map_index;
swap(res_map, fill_map);
- rp += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ rp += NFT_PIPAPO_GROUPS_PADDING(f);
}
out:
@@ -669,26 +538,19 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
- unsigned long *lt = f->lt;
- int b, group;
+ int b;
- /* For each 4-bit group: select lookup table bucket depending on
+ /* For each bit group: select lookup table bucket depending on
* packet bytes value, then AND bucket value
*/
- for (group = 0; group < f->groups; group++) {
- u8 v;
-
- if (group % 2) {
- v = *data & 0x0f;
- data++;
- } else {
- v = *data >> 4;
- }
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, res_map, data);
+ else if (f->bb == 4)
+ pipapo_and_field_buckets_4bit(f, res_map, data);
+ else
+ BUG();
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
- }
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -713,7 +575,7 @@ next_match:
goto out;
}
- data += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
/* Swap bitmap indices: fill_map will be the initial bitmap for
* the next field (i.e. the new res_map), and res_map is
@@ -736,8 +598,8 @@ out:
* @elem: nftables API element representation containing key data
* @flags: Unused
*/
-void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
return pipapo_get(net, set, (const u8 *)elem->key.val.data,
nft_genmask_cur(net));
@@ -763,6 +625,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
int group, bucket;
new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
+#ifdef NFT_PIPAPO_ALIGN
+ new_bucket_size = roundup(new_bucket_size,
+ NFT_PIPAPO_ALIGN / sizeof(*new_lt));
+#endif
if (new_bucket_size == f->bsize)
goto mt;
@@ -772,15 +638,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
else
copy = new_bucket_size;
- new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size *
- sizeof(*new_lt), GFP_KERNEL);
+ new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) *
+ new_bucket_size * sizeof(*new_lt) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL);
if (!new_lt)
return -ENOMEM;
- new_p = new_lt;
- old_p = old_lt;
+ new_p = NFT_PIPAPO_LT_ALIGN(new_lt);
+ old_p = NFT_PIPAPO_LT_ALIGN(old_lt);
+
for (group = 0; group < f->groups; group++) {
- for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) {
+ for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) {
memcpy(new_p, old_p, copy * sizeof(*new_p));
new_p += copy;
old_p += copy;
@@ -807,7 +676,7 @@ mt:
if (new_lt) {
f->bsize = new_bucket_size;
- f->lt = new_lt;
+ NFT_PIPAPO_LT_ASSIGN(f, new_lt);
kvfree(old_lt);
}
@@ -829,13 +698,196 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
{
unsigned long *pos;
- pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt);
+ pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group;
pos += f->bsize * v;
__set_bit(rule, pos);
}
/**
+ * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit intersection between:
+ * - bucket with index given by the upper 4 bits of b, from old group 2 * g, and
+ * - bucket with index given by the lower 4 bits of b, from old group 2 * g + 1
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * N(b, g) := O(b / 16, 2 * g) & O(b % 16, 2 * g + 1)
+ *
+ * This ensures equivalence of the matching results on lookup. Two examples in
+ * pictures:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255
+ * 0 ^
+ * 1 | ^
+ * ... ( & ) |
+ * / \ |
+ * / \ .-( & )-.
+ * / bucket \ | |
+ * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 |
+ * 0 / \ | |
+ * 1 \ | |
+ * 2 | --'
+ * 3 '-
+ * ...
+ */
+static void pipapo_lt_4b_to_8b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, i;
+
+ for (g = 0; g < old_groups / 2; g++) {
+ int src_g0 = g * 2, src_g1 = g * 2 + 1;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) {
+ int src_b0 = b / NFT_PIPAPO_BUCKETS(4);
+ int src_b1 = b % NFT_PIPAPO_BUCKETS(4);
+ int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0;
+ int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1;
+
+ for (i = 0; i < bsize; i++) {
+ *new_lt = old_lt[src_i0 * bsize + i] &
+ old_lt[src_i1 * bsize + i];
+ new_lt++;
+ }
+ }
+ }
+}
+
+/**
+ * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit union of:
+ * - all the buckets with index such that the upper four bits of the lower byte
+ * equal b, from old group g / 2, with g even
+ * - all the buckets with index such that the lower four bits equal b, from
+ * old group g / 2 (integer division), with g odd
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * - with g even: N(b, g) := U(O(x, g / 2) for each x : (x & 0xf0) >> 4 = b)
+ * - with g odd: N(b, g) := U(O(x, g / 2) for each x : (x & 0x0f) = b)
+ *
+ * where U() denotes the arbitrary union operation (binary OR of n terms). This
+ * ensures equivalence of the matching results on lookup.
+ */
+static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, bsrc, i;
+
+ memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize *
+ sizeof(unsigned long));
+
+ for (g = 0; g < old_groups * 2; g += 2) {
+ int src_g = g / 2;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if (((bsrc & 0xf0) >> 4) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if ((bsrc & 0x0f) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+ }
+}
+
+/**
+ * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed
+ * @f: Field containing lookup table
+ */
+static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
+{
+ unsigned long *new_lt;
+ int groups, bb;
+ size_t lt_size;
+
+ lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
+ sizeof(*f->lt);
+
+ if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET &&
+ lt_size > NFT_PIPAPO_LT_SIZE_HIGH) {
+ groups = f->groups * 2;
+ bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
+
+ lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
+ sizeof(*f->lt);
+ } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
+ lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
+ groups = f->groups / 2;
+ bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
+
+ lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
+ sizeof(*f->lt);
+
+ /* Don't increase group width if the resulting lookup table size
+ * would exceed the upper size threshold for a "small" set.
+ */
+ if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH)
+ return;
+ } else {
+ return;
+ }
+
+ new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ if (!new_lt)
+ return;
+
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+ if (f->bb == 4 && bb == 8) {
+ pipapo_lt_4b_to_8b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else if (f->bb == 8 && bb == 4) {
+ pipapo_lt_8b_to_4b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else {
+ BUG();
+ }
+
+ f->groups = groups;
+ f->bb = bb;
+ kvfree(f->lt);
+ NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+}
+
+/**
* pipapo_insert() - Insert new rule in field given input key and mask length
* @f: Field containing lookup table
* @k: Input key for classification, without nftables padding
@@ -849,7 +901,7 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int mask_bits)
{
- int rule = f->rules++, group, ret;
+ int rule = f->rules++, group, ret, bit_offset = 0;
ret = pipapo_resize(f, f->rules - 1, f->rules);
if (ret)
@@ -859,28 +911,33 @@ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int i, v;
u8 mask;
- if (group % 2)
- v = k[group / 2] & 0x0f;
- else
- v = k[group / 2] >> 4;
+ v = k[group / (BITS_PER_BYTE / f->bb)];
+ v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0);
+ v >>= (BITS_PER_BYTE - bit_offset) - f->bb;
+
+ bit_offset += f->bb;
+ bit_offset %= BITS_PER_BYTE;
- if (mask_bits >= (group + 1) * 4) {
+ if (mask_bits >= (group + 1) * f->bb) {
/* Not masked */
pipapo_bucket_set(f, rule, group, v);
- } else if (mask_bits <= group * 4) {
+ } else if (mask_bits <= group * f->bb) {
/* Completely masked */
- for (i = 0; i < NFT_PIPAPO_BUCKETS; i++)
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++)
pipapo_bucket_set(f, rule, group, i);
} else {
/* The mask limit falls on this group */
- mask = 0x0f >> (mask_bits - group * 4);
- for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) {
+ mask = GENMASK(f->bb - 1, 0);
+ mask >>= mask_bits - group * f->bb;
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) {
if ((i & ~mask) == (v & ~mask))
pipapo_bucket_set(f, rule, group, i);
}
}
}
+ pipapo_lt_bits_adjust(f);
+
return 1;
}
@@ -1018,7 +1075,7 @@ out:
* @m: Matching data, including mapping table
* @map: Table of rule maps: array of first rule and amount of rules
* in next field a given rule maps to, for each field
- * @ext: For last field, nft_set_ext pointer matching rules map to
+ * @e: For last field, nft_set_ext pointer matching rules map to
*/
static void pipapo_map(struct nft_pipapo_match *m,
union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS],
@@ -1042,7 +1099,7 @@ static void pipapo_map(struct nft_pipapo_match *m,
/**
* pipapo_realloc_scratch() - Reallocate scratch maps for partial match results
* @clone: Copy of matching data with pending insertions and deletions
- * @bsize_max Maximum bucket size, scratch maps cover two buckets
+ * @bsize_max: Maximum bucket size, scratch maps cover two buckets
*
* Return: 0 on success, -ENOMEM on failure.
*/
@@ -1053,8 +1110,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
for_each_possible_cpu(i) {
unsigned long *scratch;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long *scratch_aligned;
+#endif
- scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2,
+ scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 +
+ NFT_PIPAPO_ALIGN_HEADROOM,
GFP_KERNEL, cpu_to_node(i));
if (!scratch) {
/* On failure, there's no need to undo previous
@@ -1070,6 +1131,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
kfree(*per_cpu_ptr(clone->scratch, i));
*per_cpu_ptr(clone->scratch, i) = scratch;
+
+#ifdef NFT_PIPAPO_ALIGN
+ scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch);
+ *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned;
+#endif
}
return 0;
@@ -1098,21 +1164,41 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
struct nft_pipapo_field *f;
int i, bsize_max, err = 0;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+ end = (const u8 *)nft_set_ext_key_end(ext)->data;
+ else
+ end = start;
+
dup = pipapo_get(net, set, start, genmask);
- if (PTR_ERR(dup) == -ENOENT) {
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
- end = (const u8 *)nft_set_ext_key_end(ext)->data;
- dup = pipapo_get(net, set, end, nft_genmask_next(net));
- } else {
- end = start;
+ if (!IS_ERR(dup)) {
+ /* Check if we already have the same exact entry */
+ const struct nft_data *dup_key, *dup_end;
+
+ dup_key = nft_set_ext_key(&dup->ext);
+ if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END))
+ dup_end = nft_set_ext_key_end(&dup->ext);
+ else
+ dup_end = dup_key;
+
+ if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
+ !memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
+ *ext2 = &dup->ext;
+ return -EEXIST;
}
+
+ return -ENOTEMPTY;
+ }
+
+ if (PTR_ERR(dup) == -ENOENT) {
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(net, set, end, nft_genmask_next(net));
}
if (PTR_ERR(dup) != -ENOENT) {
if (IS_ERR(dup))
return PTR_ERR(dup);
*ext2 = &dup->ext;
- return -EEXIST;
+ return -ENOTEMPTY;
}
/* Validate */
@@ -1123,11 +1209,11 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
return -ENOSPC;
if (memcmp(start_p, end_p,
- f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0)
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0)
return -EINVAL;
- start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
/* Insert */
@@ -1141,32 +1227,31 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
rulemap[i].to = f->rules;
ret = memcmp(start, end,
- f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
- if (!ret) {
- ret = pipapo_insert(f, start,
- f->groups * NFT_PIPAPO_GROUP_BITS);
- } else {
- ret = pipapo_expand(f, start, end,
- f->groups * NFT_PIPAPO_GROUP_BITS);
- }
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
+ if (!ret)
+ ret = pipapo_insert(f, start, f->groups * f->bb);
+ else
+ ret = pipapo_expand(f, start, end, f->groups * f->bb);
if (f->bsize > bsize_max)
bsize_max = f->bsize;
rulemap[i].n = ret;
- start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
- if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+ if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+ put_cpu_ptr(m->scratch);
+
err = pipapo_realloc_scratch(m, bsize_max);
if (err)
return err;
- this_cpu_write(nft_pipapo_scratch_index, false);
-
m->bsize_max = bsize_max;
+ } else {
+ put_cpu_ptr(m->scratch);
}
*ext2 = &e->ext;
@@ -1200,23 +1285,40 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (!new->scratch)
goto out_scratch;
+#ifdef NFT_PIPAPO_ALIGN
+ new->scratch_aligned = alloc_percpu(*new->scratch_aligned);
+ if (!new->scratch_aligned)
+ goto out_scratch;
+#endif
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(new->scratch, i) = NULL;
+
+ if (pipapo_realloc_scratch(new, old->bsize_max))
+ goto out_scratch_realloc;
+
rcu_head_init(&new->rcu);
src = old->f;
dst = new->f;
for (i = 0; i < old->field_count; i++) {
+ unsigned long *new_lt;
+
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
- dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS *
- src->bsize * sizeof(*dst->lt),
- GFP_KERNEL);
- if (!dst->lt)
+ new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
+ src->bsize * sizeof(*dst->lt) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL);
+ if (!new_lt)
goto out_lt;
- memcpy(dst->lt, src->lt,
+ NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
+
+ memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
+ NFT_PIPAPO_LT_ALIGN(src->lt),
src->bsize * sizeof(*dst->lt) *
- src->groups * NFT_PIPAPO_BUCKETS);
+ src->groups * NFT_PIPAPO_BUCKETS(src->bb));
dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
if (!dst->mt)
@@ -1237,8 +1339,14 @@ out_lt:
kvfree(dst->lt);
dst--;
}
- free_percpu(new->scratch);
+out_scratch_realloc:
+ for_each_possible_cpu(i)
+ kfree(*per_cpu_ptr(new->scratch, i));
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(new->scratch_aligned);
+#endif
out_scratch:
+ free_percpu(new->scratch);
kfree(new);
return ERR_PTR(-ENOMEM);
@@ -1347,7 +1455,7 @@ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
/**
* pipapo_drop() - Delete entry from lookup and mapping tables, given rule map
* @m: Matching data
- * @rulemap Table of rule maps, arrays of first rule and amount of rules
+ * @rulemap: Table of rule maps, arrays of first rule and amount of rules
* in next field a given entry maps to, for each field
*
* For each rule in lookup table buckets mapping to this set of rules, drop
@@ -1394,9 +1502,10 @@ static void pipapo_drop(struct nft_pipapo_match *m,
unsigned long *pos;
int b;
- pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g *
+ NFT_PIPAPO_BUCKETS(f->bb) * f->bsize;
- for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
bitmap_cut(pos, pos, rulemap[i].to,
rulemap[i].n,
f->bsize * BITS_PER_LONG);
@@ -1414,6 +1523,8 @@ static void pipapo_drop(struct nft_pipapo_match *m,
;
}
f->rules -= rulemap[i].n;
+
+ pipapo_lt_bits_adjust(f);
}
}
@@ -1426,11 +1537,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
{
struct nft_pipapo *priv = nft_set_priv(set);
int rules_f0, first_rule = 0;
+ struct nft_pipapo_elem *e;
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
struct nft_pipapo_field *f;
- struct nft_pipapo_elem *e;
int i, start, rules_fx;
start = first_rule;
@@ -1466,6 +1577,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
}
}
+ e = nft_set_catchall_gc(set);
+ if (e)
+ nft_set_elem_destroy(set, e, true);
+
priv->last_gc = jiffies;
}
@@ -1498,6 +1613,9 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
for_each_possible_cpu(i)
kfree(*per_cpu_ptr(m->scratch, i));
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
free_percpu(m->scratch);
pipapo_free_fields(m);
@@ -1690,30 +1808,33 @@ static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule,
int rule_count, u8 *left, u8 *right)
{
+ int g, mask_len = 0, bit_offset = 0;
u8 *l = left, *r = right;
- int g, mask_len = 0;
for (g = 0; g < f->groups; g++) {
int b, x0, x1;
x0 = -1;
x1 = -1;
- for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
unsigned long *pos;
- pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) +
+ (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize;
if (test_bit(first_rule, pos) && x0 == -1)
x0 = b;
if (test_bit(first_rule + rule_count - 1, pos))
x1 = b;
}
- if (g % 2) {
- *(l++) |= x0 & 0x0f;
- *(r++) |= x1 & 0x0f;
- } else {
- *l |= x0 << 4;
- *r |= x1 << 4;
+ *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset);
+ *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset);
+
+ bit_offset += f->bb;
+ if (bit_offset >= BITS_PER_BYTE) {
+ bit_offset %= BITS_PER_BYTE;
+ l++;
+ r++;
}
if (x1 - x0 == 0)
@@ -1748,8 +1869,9 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
pipapo_get_boundaries(f, first_rule, rule_count, left, right);
- return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) &&
- !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+ return !memcmp(start, left,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) &&
+ !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
}
/**
@@ -1801,8 +1923,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
rules_fx = f->mt[start].n;
start = f->mt[start].to;
- match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
if (i == m->field_count) {
@@ -1885,56 +2007,24 @@ static u64 nft_pipapo_privsize(const struct nlattr * const nla[],
}
/**
- * nft_pipapo_estimate() - Estimate set size, space and lookup complexity
- * @desc: Set description, element count and field description used here
+ * nft_pipapo_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, element count and field description used
* @features: Flags: NFT_SET_INTERVAL needs to be there
* @est: Storage for estimation data
*
- * The size for this set type can vary dramatically, as it depends on the number
- * of rules (composing netmasks) the entries expand to. We compute the worst
- * case here.
- *
- * In general, for a non-ranged entry or a single composing netmask, we need
- * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
- * is, each input bit needs four bits of matching data), plus a bucket in the
- * mapping table for each field.
- *
- * Return: true only for compatible range concatenations
+ * Return: true if set description is compatible, false otherwise
*/
static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
- unsigned long entry_size;
- int i;
-
- if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1)
+ if (!(features & NFT_SET_INTERVAL) ||
+ desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- for (i = 0, entry_size = 0; i < desc->field_count; i++) {
- unsigned long rules;
-
- if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
- return false;
-
- /* Worst-case ranges for each concatenated field: each n-bit
- * field can expand to up to n * 2 rules in each bucket, and
- * each rule also needs a mapping bucket.
- */
- rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
- entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE;
- entry_size += rules * sizeof(union nft_pipapo_map_bucket);
- }
-
- /* Rules in lookup and mapping tables are needed for each entry */
- est->size = desc->size * entry_size;
- if (est->size && div_u64(est->size, desc->size) != entry_size)
+ est->size = pipapo_estimate_size(desc);
+ if (!est->size)
return false;
- est->size += sizeof(struct nft_pipapo) +
- sizeof(struct nft_pipapo_match) * 2;
-
- est->size += sizeof(struct nft_pipapo_field) * desc->field_count;
-
est->lookup = NFT_SET_CLASS_O_LOG_N;
est->space = NFT_SET_CLASS_O_N;
@@ -1961,38 +2051,52 @@ static int nft_pipapo_init(const struct nft_set *set,
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
struct nft_pipapo_field *f;
- int err, i;
+ int err, i, field_count;
+
+ field_count = desc->field_count ? : 1;
- if (desc->field_count > NFT_PIPAPO_MAX_FIELDS)
+ if (field_count > NFT_PIPAPO_MAX_FIELDS)
return -EINVAL;
- m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count,
+ m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count,
GFP_KERNEL);
if (!m)
return -ENOMEM;
- m->field_count = desc->field_count;
+ m->field_count = field_count;
m->bsize_max = 0;
m->scratch = alloc_percpu(unsigned long *);
if (!m->scratch) {
err = -ENOMEM;
- goto out_free;
+ goto out_scratch;
}
for_each_possible_cpu(i)
*per_cpu_ptr(m->scratch, i) = NULL;
+#ifdef NFT_PIPAPO_ALIGN
+ m->scratch_aligned = alloc_percpu(unsigned long *);
+ if (!m->scratch_aligned) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(m->scratch_aligned, i) = NULL;
+#endif
+
rcu_head_init(&m->rcu);
nft_pipapo_for_each_field(f, i, m) {
- f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE;
- priv->groups += f->groups;
+ int len = desc->field_len[i] ? : set->klen;
+
+ f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
+ f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
- priv->width += round_up(desc->field_len[i], sizeof(u32));
+ priv->width += round_up(len, sizeof(u32));
f->bsize = 0;
f->rules = 0;
- f->lt = NULL;
+ NFT_PIPAPO_LT_ASSIGN(f, NULL);
f->mt = NULL;
}
@@ -2010,13 +2114,43 @@ static int nft_pipapo_init(const struct nft_set *set,
return 0;
out_free:
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
free_percpu(m->scratch);
+out_scratch:
kfree(m);
return err;
}
/**
+ * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array
+ * @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
+ */
+static void nft_set_pipapo_match_destroy(const struct nft_set *set,
+ struct nft_pipapo_match *m)
+{
+ struct nft_pipapo_field *f;
+ int i, r;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ e = f->mt[r].e;
+
+ nft_set_elem_destroy(set, e, true);
+ }
+}
+
+/**
* nft_pipapo_destroy() - Free private data for set and all committed elements
* @set: nftables API set representation
*/
@@ -2024,37 +2158,34 @@ static void nft_pipapo_destroy(const struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
- int i, r, cpu;
+ int cpu;
m = rcu_dereference_protected(priv->match, true);
if (m) {
rcu_barrier();
- for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
- ;
-
- for (r = 0; r < f->rules; r++) {
- struct nft_pipapo_elem *e;
-
- if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
- continue;
-
- e = f->mt[r].e;
-
- nft_set_elem_destroy(set, e, true);
- }
+ nft_set_pipapo_match_destroy(set, m);
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
for_each_possible_cpu(cpu)
kfree(*per_cpu_ptr(m->scratch, cpu));
free_percpu(m->scratch);
-
pipapo_free_fields(m);
kfree(m);
priv->match = NULL;
}
if (priv->clone) {
+ m = priv->clone;
+
+ if (priv->dirty)
+ nft_set_pipapo_match_destroy(set, m);
+
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(priv->clone->scratch_aligned);
+#endif
for_each_possible_cpu(cpu)
kfree(*per_cpu_ptr(priv->clone->scratch, cpu));
free_percpu(priv->clone->scratch);
@@ -2081,8 +2212,7 @@ static void nft_pipapo_gc_init(const struct nft_set *set)
priv->last_gc = jiffies;
}
-struct nft_set_type nft_set_pipapo_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_pipapo_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT,
.ops = {
@@ -2102,3 +2232,26 @@ struct nft_set_type nft_set_pipapo_type __read_mostly = {
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+const struct nft_set_type nft_set_pipapo_avx2_type = {
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT,
+ .ops = {
+ .lookup = nft_pipapo_avx2_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_avx2_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
+#endif
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
new file mode 100644
index 000000000000..25a75591583e
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo.h
@@ -0,0 +1,280 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NFT_SET_PIPAPO_H
+#define _NFT_SET_PIPAPO_H
+#include <linux/log2.h>
+#include <net/ipv6.h> /* For the maximum length of a field */
+
+/* Count of concatenated fields depends on count of 32-bit nftables registers */
+#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
+
+/* Restrict usage to multiple fields, make sure rbtree is used otherwise */
+#define NFT_PIPAPO_MIN_FIELDS 2
+
+/* Largest supported field size: one IPv6 address, in bytes and in bits */
+#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
+#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
+
+/* Bits to be grouped together in table buckets depending on set size.
+ *
+ * NFT_PIPAPO_GROUP_BITS_INIT aliases the small-set width (8 bits).
+ * NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 expands to a BUILD_BUG_ON() that breaks the
+ * build if either width is changed from 8 and 4.
+ * NFT_PIPAPO_GROUPS_PER_BYTE() is the number of (f)->bb-bit groups in a byte.
+ */
+#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET
+#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8
+#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4
+#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \
+ BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \
+ (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4))
+#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb)
+
+/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the
+ * small group width, and switch to the big group width if the table gets
+ * smaller than NFT_PIPAPO_LT_SIZE_LOW.
+ *
+ * Picking 2MiB as threshold (for a single table) avoids as much as possible
+ * crossing page boundaries on most architectures (x86-64 and MIPS huge pages,
+ * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which
+ * keeps performance nice in case kvmalloc() gives us non-contiguous areas.
+ *
+ * NFT_PIPAPO_LT_SIZE_LOW sits NFT_PIPAPO_LT_SIZE_HYSTERESIS below the high
+ * mark. It is parenthesised as a whole so that it keeps its value when used
+ * inside a larger expression (e.g. multiplied or negated).
+ */
+#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21)
+#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16)
+#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD
+#define NFT_PIPAPO_LT_SIZE_LOW (NFT_PIPAPO_LT_SIZE_THRESHOLD - \
+ NFT_PIPAPO_LT_SIZE_HYSTERESIS)
+
+/* Fields are padded to 32 bits in input registers: PADDED_SIZE is the field
+ * length in bytes rounded up to a multiple of sizeof(u32), PADDING is the
+ * number of bytes added by that rounding.
+ */
+#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \
+ (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32)))
+#define NFT_PIPAPO_GROUPS_PADDING(f) \
+ (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \
+ NFT_PIPAPO_GROUPS_PER_BYTE(f))
+
+/* Number of buckets given by 2 ^ n, with n bucket bits */
+#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb))
+
+/* Each n-bit range maps to up to n * 2 rules */
+#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
+
+/* Use the rest of mapping table buckets for rule indices, but it makes no sense
+ * to exceed 32 bits
+ */
+#if BITS_PER_LONG == 64
+#define NFT_PIPAPO_MAP_TOBITS 32
+#else
+#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
+#endif /* BITS_PER_LONG == 64 */
+
+/* ...which gives us the highest allowed index for a rule */
+#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
+ - (1UL << NFT_PIPAPO_MAP_NBITS))
+
+/* Definitions for vectorised implementations.
+ *
+ * With NFT_PIPAPO_ALIGN defined, NFT_PIPAPO_LT_ALIGN() rounds a lookup table
+ * pointer up to that alignment, NFT_PIPAPO_ALIGN_HEADROOM is the difference
+ * between that alignment and ARCH_KMALLOC_MINALIGN, and NFT_PIPAPO_LT_ASSIGN()
+ * stores both the raw and the aligned pointer in the field.
+ *
+ * Without it, there is no headroom, NFT_PIPAPO_LT_ALIGN() returns its argument
+ * unchanged and NFT_PIPAPO_LT_ASSIGN() only sets (field)->lt.
+ */
+#ifdef NFT_PIPAPO_ALIGN
+#define NFT_PIPAPO_ALIGN_HEADROOM \
+ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
+#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
+#define NFT_PIPAPO_LT_ASSIGN(field, x) \
+ do { \
+ (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \
+ (field)->lt = (x); \
+ } while (0)
+#else
+#define NFT_PIPAPO_ALIGN_HEADROOM 0
+#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
+#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x))
+#endif /* NFT_PIPAPO_ALIGN */
+
+#define nft_pipapo_for_each_field(field, index, match) /* walks (match)->f[0 .. field_count - 1] */ \
+ for ((field) = (match)->f, (index) = 0; \
+ (index) < (match)->field_count; \
+ (index)++, (field)++)
+
+/**
+ * union nft_pipapo_map_bucket - Bucket of mapping table
+ * @to: First rule number (in next field) this rule maps to
+ * @n: Number of rules (in next field) this rule maps to
+ * @e: If there's no next field, pointer to element this rule maps to
+ *
+ * With 64-bit longs, @to and @n are plain u32 members and the static_assert()s
+ * check that the configured bit widths fit in them. Otherwise they are
+ * bitfields of NFT_PIPAPO_MAP_TOBITS and NFT_PIPAPO_MAP_NBITS bits, which add
+ * up to BITS_PER_LONG.
+ */
+union nft_pipapo_map_bucket {
+ struct {
+#if BITS_PER_LONG == 64
+ static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
+ u32 to;
+
+ static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
+ u32 n;
+#else
+ unsigned long to:NFT_PIPAPO_MAP_TOBITS;
+ unsigned long n:NFT_PIPAPO_MAP_NBITS;
+#endif
+ };
+ struct nft_pipapo_elem *e;
+};
+
+/**
+ * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
+ * @groups: Amount of bit groups
+ * @rules: Number of inserted rules
+ * @bsize: Size of each bucket in lookup table, in longs
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @lt: Lookup table: 'groups' rows of buckets
+ * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes, only present
+ * if NFT_PIPAPO_ALIGN is defined
+ * @mt: Mapping table: one bucket per rule
+ */
+struct nft_pipapo_field {
+ int groups;
+ unsigned long rules;
+ size_t bsize;
+ int bb;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long *lt_aligned;
+#endif
+ unsigned long *lt;
+ union nft_pipapo_map_bucket *mt;
+};
+
+/**
+ * struct nft_pipapo_match - Data used for lookup and matching
+ * @field_count: Amount of fields in set
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes, only
+ * present if NFT_PIPAPO_ALIGN is defined
+ * @bsize_max: Maximum lookup table bucket size of all fields, in longs
+ * @rcu: Matching data is swapped on commits
+ * @f: Fields, with lookup and mapping tables (flexible array,
+ * iterated up to @field_count by nft_pipapo_for_each_field())
+ */
+struct nft_pipapo_match {
+ int field_count;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long * __percpu *scratch_aligned;
+#endif
+ unsigned long * __percpu *scratch;
+ size_t bsize_max;
+ struct rcu_head rcu;
+ struct nft_pipapo_field f[];
+};
+
+/**
+ * struct nft_pipapo - Representation of a set
+ * @match: Currently in-use matching data (__rcu annotated pointer)
+ * @clone: Copy where pending insertions and deletions are kept
+ * @width: Total bytes to be matched for one packet, including padding
+ * @dirty: Working copy has pending insertions or deletions
+ * @last_gc: Timestamp of last garbage collection run, jiffies
+ */
+struct nft_pipapo {
+ struct nft_pipapo_match __rcu *match;
+ struct nft_pipapo_match *clone;
+ int width;
+ bool dirty;
+ unsigned long last_gc;
+};
+
+struct nft_pipapo_elem; /* forward declaration; the full definition follows right below */
+
+/**
+ * struct nft_pipapo_elem - API-facing representation of single set element
+ * @ext: nftables API extensions
+ */
+struct nft_pipapo_elem {
+ struct nft_set_ext ext;
+};
+
+int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool match_only); /* prototype only: the definition is not in this header */
+
+/**
+ * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ *
+ * Each byte of @data selects two buckets: the upper four bits index the bucket
+ * for one group, the lower four bits the bucket for the following group. Every
+ * selected bucket (f->bsize longs) is ANDed into @dst in place, and the table
+ * pointer then moves on by the sixteen buckets making up one group.
+ */
+static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) { /* two groups per input byte */
+ u8 v;
+
+ v = *data >> 4; /* upper four bits first */
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+
+ v = *data & 0x0f; /* then lower four bits */
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+ }
+}
+
+/**
+ * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ *
+ * Each byte of @data is the bucket index for one group: the selected bucket
+ * (f->bsize longs) is ANDed into @dst in place, and the table pointer then
+ * moves on by the 256 buckets making up one group.
+ */
+static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group++, data++) { /* one group per input byte */
+ __bitmap_and(dst, dst, lt + *data * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(8);
+ }
+}
+
+/**
+ * pipapo_estimate_size() - Estimate worst-case for set size
+ * @desc: Set description, element count and field description used here
+ *
+ * The size for this set type can vary dramatically, as it depends on the number
+ * of rules (composing netmasks) the entries expand to. We compute the worst
+ * case here.
+ *
+ * In general, for a non-ranged entry or a single composing netmask, we need
+ * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
+ * is, each input bit needs four bits of matching data), plus a bucket in the
+ * mapping table for each field.
+ *
+ * NOTE(review): the computation below uses NFT_PIPAPO_GROUP_BITS_INIT, which is
+ * defined as the 8-bit (small set) width, rather than the 4-bit groups
+ * described above, and derives the rule count from ilog2() of the field width
+ * in bits -- confirm which description is the intended one.
+ *
+ * Return: worst-case set size in bytes, 0 on any overflow or if a field is
+ * longer than NFT_PIPAPO_MAX_BYTES
+ */
+static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
+{
+ unsigned long entry_size;
+ u64 size;
+ int i;
+
+ for (i = 0, entry_size = 0; i < desc->field_count; i++) {
+ unsigned long rules;
+
+ if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
+ return 0;
+
+ /* Worst-case ranges for each concatenated field: each n-bit
+ * field can expand to up to n * 2 rules in each bucket, and
+ * each rule also needs a mapping bucket.
+ */
+ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
+ entry_size += rules *
+ NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) /
+ BITS_PER_BYTE; /* lookup table share, in bytes */
+ entry_size += rules * sizeof(union nft_pipapo_map_bucket); /* mapping table share */
+ }
+
+ /* Rules in lookup and mapping tables are needed for each entry */
+ size = desc->size * entry_size;
+ if (size && div_u64(size, desc->size) != entry_size) /* multiplication overflowed */
+ return 0;
+
+ size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2;
+
+ size += sizeof(struct nft_pipapo_field) * desc->field_count;
+
+ return size;
+}
+
+#endif /* _NFT_SET_PIPAPO_H */
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
new file mode 100644
index 000000000000..52e0d026d30a
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -0,0 +1,1228 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
+ *
+ * Copyright (c) 2019-2020 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
+
+#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) /* longs in one YMM-sized chunk */
+
+/* Load from memory into YMM register with non-temporal hint ("stream load"),
+ * that is, don't fetch lines from memory into the cache. This avoids pushing
+ * precious packet data out of the cache hierarchy, and is appropriate when:
+ *
+ * - loading buckets from lookup tables, as they are not going to be used
+ * again before packets are entirely classified
+ *
+ * - loading the result bitmap from the previous field, as it's never used
+ * again
+ *
+ * @reg is a literal register number, pasted into the instruction text.
+ */
+#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
+ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
+
+/* Stream a single lookup table bucket into YMM register given lookup table,
+ * group index, value of packet bits, bucket size. The LOAD4 and LOAD8 variants
+ * only differ in the number of buckets per group (sixteen or 256).
+ */
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
+ (v)) * (bsize)])
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
+ (v)) * (bsize)])
+
+/* Bitwise AND: the staple operation of this algorithm. Register @dst receives
+ * the intersection of registers @a and @b.
+ */
+#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
+ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
+
+/* Jump to label if @reg is zero */
+#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
+ asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ "je %l[" #label "]" : : : : label)
+
+/* Store 256 bits from YMM register into memory. Contrary to bucket load
+ * operation, we don't bypass the cache here, as stored matching results
+ * are always used shortly after.
+ */
+#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
+
+/* Zero out a complete YMM register, @reg */
+#define NFT_PIPAPO_AVX2_ZERO(reg) \
+ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
+
+/* Current working bitmap index, toggled between field matches */
+static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
+
+/**
+ * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
+ *
+ * This zeroes out ymm15, which is later used whenever we need to clear a
+ * memory location, by storing its content into memory: the lookup routines
+ * store register 15 into the current map chunk on their no-match path.
+ */
+static void nft_pipapo_avx2_prepare(void)
+{
+ NFT_PIPAPO_AVX2_ZERO(15);
+}
+
+/**
+ * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
+ * @data: Base memory area
+ * @start: First bit to set
+ * @len: Count of bits to fill
+ *
+ * This is nothing else than a version of bitmap_set(), as used e.g. by
+ * pipapo_refill(), tailored for the microarchitectures using it and better
+ * suited for the specific usage: it's very likely that we'll set a small number
+ * of bits, not crossing a word boundary, and correct branch prediction is
+ * critical here.
+ *
+ * Cases are handled from the most to the least likely one: a single bit, a
+ * range contained in one word, a range spilling into the next word, and
+ * finally a long range, filled with memset() plus a partial trailing word.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ */
+static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
+{
+ int offset = start % BITS_PER_LONG;
+ unsigned long mask;
+
+ data += start / BITS_PER_LONG; /* word containing the first bit */
+
+ if (likely(len == 1)) {
+ *data |= BIT(offset);
+ return;
+ }
+
+ if (likely(len < BITS_PER_LONG || offset)) {
+ if (likely(len + offset <= BITS_PER_LONG)) { /* fits in this word */
+ *data |= GENMASK(len - 1 + offset, offset);
+ return;
+ }
+
+ *data |= ~0UL << offset; /* fill up to the end of the first word */
+ len -= BITS_PER_LONG - offset;
+ data++;
+
+ if (len <= BITS_PER_LONG) { /* remainder ends within the second word */
+ mask = ~0UL >> (BITS_PER_LONG - len);
+ *data |= mask;
+ return;
+ }
+ }
+
+ memset(data, 0xff, len / BITS_PER_BYTE); /* whole bytes; data is word-aligned from here on */
+ data += len / BITS_PER_LONG;
+
+ len %= BITS_PER_LONG;
+ if (len) /* bits left over in the last, partial word */
+ *data |= ~0UL >> (BITS_PER_LONG - len);
+}
+
+/**
+ * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
+ * @offset: Start from given bitmap (equivalent to bucket) offset, in longs
+ * @map: Bitmap to be scanned for set bits
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @last: Return index of first set bit, if this is the last field
+ *
+ * This is an alternative implementation of pipapo_refill() suitable for usage
+ * with AVX2 lookup routines: we know there are four words to be scanned, at
+ * a given offset inside the map, for each matching iteration.
+ *
+ * Unless @last is set, every bit found in @map is cleared after the range
+ * given by its mapping table bucket has been filled in @dst, so the four
+ * scanned words are zero on return.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ *
+ * Return: index of the first set bit if @last; otherwise the 'to' value of the
+ * mapping bucket for the first set bit, that is, the first bit position filled
+ * in @dst. -1 if no bit is set in the four words.
+ */
+static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
+ unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool last)
+{
+ int ret = -1;
+
+#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
+ do { \
+ while (map[(x)]) { \
+ int r = __builtin_ctzl(map[(x)]); \
+ int i = (offset + (x)) * BITS_PER_LONG + r; \
+ \
+ if (last) \
+ return i; \
+ \
+ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
+ \
+ if (ret == -1) \
+ ret = mt[i].to; \
+ \
+ map[(x)] &= ~(1UL << r); \
+ } \
+ } while (0)
+
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
+#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * Load buckets from lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap), use the provided starting
+ * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
+ * working bitmap, @fill.
+ *
+ * This is used for 8-bit fields (e.g. protocol numbers).
+ *
+ * Out-of-order (and superscalar) execution is vital here, so it's critical to
+ * avoid false data dependencies. CPU and compiler could (mostly) take care of
+ * this on their own, but the operation ordering is explicitly given here with
+ * a likely execution order in mind, to highlight possible stalls. That's why
+ * a number of logically distinct operations (i.e. loading buckets, intersecting
+ * buckets) are interleaved.
+ *
+ * On the 'nomatch' path the current chunk of @map is cleared by storing ymm15,
+ * which nft_pipapo_avx2_prepare() zeroes; on the 'nothing' path the chunk was
+ * already zero and is left untouched.
+ *
+ * NOTE(review): buckets are addressed from f->lt rather than from the aligned
+ * pointer the scalar helpers obtain via NFT_PIPAPO_LT_ALIGN(); vmovntdqa
+ * expects a 32-byte aligned operand, so confirm f->lt is suitably aligned here.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise the first
+ * filled bit position divided by XSAVE_YMM_SIZE, that is, the index of the
+ * first YMM-sized chunk (not long word) to be checked next.
+ */
+static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (e.g. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise the first
+ * filled bit position divided by XSAVE_YMM_SIZE, that is, the index of the
+ * first YMM-sized chunk (not long word) to be checked next.
+ */
+static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 6, 7);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (e.g. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise the first
+ * filled bit position divided by XSAVE_YMM_SIZE, that is, the index of the
+ * first YMM-sized chunk (not long word) to be checked next.
+ */
+static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_AND(9, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 6, 7);
+ NFT_PIPAPO_AVX2_AND(12, 8, 9);
+ NFT_PIPAPO_AVX2_AND(13, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 6, 7);
+ NFT_PIPAPO_AVX2_AND(13, 8, 9);
+ NFT_PIPAPO_AVX2_AND(14, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ NFT_PIPAPO_AVX2_AND(1, 1, 14);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (e.g. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise the first
+ * filled bit position divided by XSAVE_YMM_SIZE, that is, the index of the
+ * first YMM-sized chunk (not long word) to be checked next.
+ */
+static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 9, 10);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(8, 6, 7);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (e.g. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise the first
+ * filled bit position divided by XSAVE_YMM_SIZE, that is, the index of the
+ * first YMM-sized chunk (not long word) to be checked next.
+ */
+static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
+ pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
+ pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
+ pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
+ pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(14, 9, 10);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 8, 9);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 12, 13);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 14, 0);
+ NFT_PIPAPO_AVX2_AND(7, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 9, 10);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ NFT_PIPAPO_AVX2_AND(1, 8, 9);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_AND(3, 12, 0);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(4, 1, 2);
+ NFT_PIPAPO_AVX2_AND(5, 3, 4);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 8-bit fields (e.g. protocol numbers).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise the first
+ * filled bit position divided by XSAVE_YMM_SIZE, that is, the index of the
+ * first YMM-sized chunk (not long word) to be checked next.
+ */
+static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_AND(2, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (i.e. ports).
+ *
+ * pkt[0] and pkt[1] are consulted, one bucket load per group. The combined
+ * result for each chunk ends up in register 4 on both the @first and the
+ * non-@first path.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Skip the lookup table words that belong to the ignored chunks */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index of this chunk in @map, counted in longs */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ /* No previous result: AND the two buckets only */
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ /* Register 0 holds the previous result for this chunk;
+ * presumably this jumps when it is all zeroes -- confirm
+ * against the macro.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(4, 3, 2);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first chunk that produced a result is recorded */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ /* Overwrite this chunk of @map with register 15, presumably the
+ * all-zero register set up by the caller -- confirm.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (i.e. IPv4 addresses).
+ *
+ * pkt[0] to pkt[3] are consulted, one bucket load per group. The combined
+ * result for each chunk ends up in register 0 on both the @first and the
+ * non-@first path.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Skip the lookup table words that belong to the ignored chunks */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index of this chunk in @map, counted in longs */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+
+ /* Stall */
+ /* Pairwise reduction: (0 & 1) & (2 & 3) */
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ /* Register 1 holds the previous result for this chunk;
+ * presumably this jumps when it is all zeroes -- confirm
+ * against the macro.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first chunk that produced a result is recorded */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ /* Overwrite this chunk of @map with register 15, presumably the
+ * all-zero register set up by the caller -- confirm.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
+ *
+ * pkt[0] to pkt[5] are consulted, one bucket load per group. Registers are
+ * reused once their contents have been folded into a partial result, and the
+ * combined result for each chunk ends up in register 4 on both paths.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Skip the lookup table words that belong to the ignored chunks */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index of this chunk in @map, counted in longs */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 2, 3);
+
+ /* Stall */
+ /* Registers 0 and 1 are free again and hold partial results */
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(1, 6, 7);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ /* Register 1 holds the previous result for this chunk;
+ * presumably this jumps when it is all zeroes -- confirm
+ * against the macro.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ /* Register 1 is reused for the last bucket from here on */
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first chunk that produced a result is recorded */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ /* Overwrite this chunk of @map with register 15, presumably the
+ * all-zero register set up by the caller -- confirm.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
+ * @map:	Previous match result, used as initial bitmap
+ * @fill:	Destination bitmap to be filled with current match result
+ * @f:		Field, containing lookup and mapping tables
+ * @offset:	Ignore buckets before the given index, no bits are filled there
+ * @pkt:	Packet data, pointer to input nftables register
+ * @first:	If this is the first field, don't source previous result
+ * @last:	Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * pkt[0] to pkt[15] are consulted, one bucket load per group. Only eight
+ * working registers (0 to 7) are used, so each register is recycled as soon as
+ * its contents have been folded into a partial result. Every loaded bucket
+ * must be ANDed into a partial result before its register is loaded again:
+ * sixteen buckets need fifteen ANDs, plus one more if the previous result is
+ * sourced.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
+					struct nft_pipapo_field *f, int offset,
+					const u8 *pkt, bool first, bool last)
+{
+	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+	unsigned long *lt = f->lt, bsize = f->bsize;
+
+	/* Skip the lookup table words that belong to the ignored chunks */
+	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+		/* Index of this chunk in @map, counted in longs */
+		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+		if (!first)
+			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  0,  pkt[0], bsize);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  1,  pkt[1], bsize);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt,  2,  pkt[2], bsize);
+		if (!first) {
+			/* Register 0 holds the previous result for this chunk */
+			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+			NFT_PIPAPO_AVX2_AND(1, 1, 0);
+		}
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt,  3,  pkt[3], bsize);
+
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  4,  pkt[4], bsize);
+		NFT_PIPAPO_AVX2_AND(6, 1, 2);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  5,  pkt[5], bsize);
+		NFT_PIPAPO_AVX2_AND(0, 3, 4);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  6,  pkt[6], bsize);
+
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  7,  pkt[7], bsize);
+		NFT_PIPAPO_AVX2_AND(3, 5, 6);
+		NFT_PIPAPO_AVX2_AND(4, 0, 1);
+		/* Fold the bucket for group 5 (register 7) into register 2
+		 * before register 7 is reloaded below. Without this AND the
+		 * bucket selected by pkt[5] never contributes to the result.
+		 */
+		NFT_PIPAPO_AVX2_AND(2, 2, 7);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  8,  pkt[8], bsize);
+
+		NFT_PIPAPO_AVX2_AND(6, 2, 3);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  9,  pkt[9], bsize);
+		NFT_PIPAPO_AVX2_AND(0, 4, 5);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
+		NFT_PIPAPO_AVX2_AND(2, 6, 7);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
+		NFT_PIPAPO_AVX2_AND(4, 0, 1);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
+		NFT_PIPAPO_AVX2_AND(6, 2, 3);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
+		NFT_PIPAPO_AVX2_AND(0, 4, 5);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
+		NFT_PIPAPO_AVX2_AND(2, 6, 7);
+		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
+		NFT_PIPAPO_AVX2_AND(4, 0, 1);
+
+		/* Stall */
+		NFT_PIPAPO_AVX2_AND(5, 2, 3);
+		NFT_PIPAPO_AVX2_AND(6, 4, 5);
+
+		NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
+		NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
+
+		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+		if (last)
+			return b;
+
+		/* Only the first chunk that produced a result is recorded */
+		if (unlikely(ret == -1))
+			ret = b / XSAVE_YMM_SIZE;
+
+		continue;
+
+nomatch:
+		/* Overwrite this chunk of @map with register 15, presumably the
+		 * all-zero register set up by the caller -- confirm.
+		 */
+		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+		;
+	}
+
+	return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * This function should never be called, but is provided for the case the field
+ * size doesn't match any of the known data types. Matching rate is
+ * substantially lower than AVX2 routines.
+ *
+ * Unlike the AVX2 routines, the helpers called here are handed the whole @map
+ * rather than one chunk at a time.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ unsigned long bsize = f->bsize;
+ int i, ret = -1, b;
+
+ /* First field: start from an all-ones map, as nothing narrows it yet */
+ if (first)
+ memset(map, 0xff, bsize * sizeof(*map));
+
+ /* NOTE(review): the loop body does not depend on i, so the same AND and
+ * refill are repeated for every iteration when !last -- confirm whether
+ * a single pass is what was intended.
+ */
+ for (i = offset; i < bsize; i++) {
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, map, pkt);
+ else
+ pipapo_and_field_buckets_4bit(f, map, pkt);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
+
+ if (last)
+ return b;
+
+ /* NOTE(review): if pipapo_refill() can return -1 here, the
+ * integer division below truncates it to 0 rather than keeping
+ * -1 -- verify against the documented return value.
+ */
+ if (ret == -1)
+ ret = b / XSAVE_YMM_SIZE;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
+ * @desc:	Set description, element count and field description used
+ * @features:	Flags: NFT_SET_INTERVAL needs to be there
+ * @est:	Storage for estimation data
+ *
+ * @est->size is written as soon as the feature and CPU checks pass, even if
+ * the estimated size then turns out to be zero and false is returned.
+ *
+ * Return: true if set is compatible and AVX2 available, false otherwise.
+ */
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+			      struct nft_set_estimate *est)
+{
+	/* Only interval sets are handled by this backend */
+	if (!(features & NFT_SET_INTERVAL))
+		return false;
+
+	if (desc->field_count < NFT_PIPAPO_MIN_FIELDS)
+		return false;
+
+	/* Both CPU features are required */
+	if (!(boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX)))
+		return false;
+
+	est->size = pipapo_estimate_size(desc);
+	if (est->size == 0)
+		return false;
+
+	est->lookup = NFT_SET_CLASS_O_LOG_N;
+	est->space = NFT_SET_CLASS_O_N;
+
+	return true;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ * @ext: nftables API extension pointer, filled with matching reference
+ *
+ * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
+ *
+ * This implementation exploits the repetitive characteristic of the algorithm
+ * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
+ *
+ * If the FPU cannot be used in the current context, the lookup is delegated to
+ * nft_pipapo_lookup(). Otherwise each field is dispatched on its group width
+ * (f->bb) and group count (f->groups) to a dedicated routine, with
+ * nft_pipapo_avx2_lookup_slow() as fallback for any other combination. The
+ * value returned for one field is passed as the offset argument for the next.
+ *
+ * Return: true on match, false otherwise.
+ */
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ unsigned long *res, *fill, *scratch;
+ u8 genmask = nft_genmask_cur(net);
+ const u8 *rp = (const u8 *)key;
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ bool map_index;
+ int i, ret = 0;
+
+ /* No FPU available here: use the non-AVX2 lookup instead */
+ if (unlikely(!irq_fpu_usable()))
+ return nft_pipapo_lookup(net, set, key, ext);
+
+ m = rcu_dereference(priv->match);
+
+ /* This also protects access to all data related to scratch maps.
+ *
+ * Note that we don't need a valid MXCSR state for any of the
+ * operations we use here, so pass 0 as mask and spare a LDMXCSR
+ * instruction.
+ */
+ kernel_fpu_begin_mask(0);
+
+ scratch = *raw_cpu_ptr(m->scratch_aligned);
+ if (unlikely(!scratch)) {
+ kernel_fpu_end();
+ return false;
+ }
+ map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
+
+ /* The scratch area holds two maps of m->bsize_max longs each;
+ * map_index selects which half is the result and which is filled.
+ */
+ res = scratch + (map_index ? m->bsize_max : 0);
+ fill = scratch + (map_index ? 0 : m->bsize_max);
+
+ /* Starting map doesn't need to be set for this implementation */
+
+ nft_pipapo_avx2_prepare();
+
+next_match:
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1, first = !i;
+
+/* Calls the routine for b-bit groups and n groups; ret is both the offset
+ * argument passed in and the variable receiving the result.
+ */
+#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
+ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
+ ret, rp, \
+ first, last))
+
+ if (likely(f->bb == 8)) {
+ if (f->groups == 1) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
+ } else if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
+ } else if (f->groups == 6) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
+ } else if (f->groups == 16) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+ ret, rp,
+ first, last);
+ }
+ } else {
+ if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
+ } else if (f->groups == 8) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
+ } else if (f->groups == 12) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
+ } else if (f->groups == 32) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+ ret, rp,
+ first, last);
+ }
+ }
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+#undef NFT_SET_PIPAPO_AVX2_LOOKUP
+
+ /* No match in this field: the whole lookup fails */
+ if (ret < 0)
+ goto out;
+
+ if (last) {
+ *ext = &f->mt[ret].e->ext;
+ /* NOTE(review): on an expired or inactive element the
+ * field loop is re-entered from next_match, but rp, res
+ * and fill keep the values reached in this pass and are
+ * not rewound -- confirm this is intended.
+ */
+ if (unlikely(nft_set_elem_expired(*ext) ||
+ !nft_set_elem_active(*ext, genmask))) {
+ ret = 0;
+ goto next_match;
+ }
+
+ goto out;
+ }
+
+ /* This field's output becomes the next field's input */
+ swap(res, fill);
+ rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ }
+
+out:
+ /* Flip the per-CPU scratch index when the loop stopped at an odd field
+ * index.
+ */
+ if (i % 2)
+ raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
+ kernel_fpu_end();
+
+ return ret >= 0;
+}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
new file mode 100644
index 000000000000..dbb6aaca8a7a
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NFT_SET_PIPAPO_AVX2_H
+/* Define the guard macro so that a second inclusion is actually skipped */
+#define _NFT_SET_PIPAPO_AVX2_H
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+#include <asm/fpu/xstate.h>
+/* Derived from XSAVE_YMM_SIZE divided by BITS_PER_BYTE */
+#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est);
+#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
+
+#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 5000b938ab1e..7325bee7d144 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -18,7 +18,7 @@
struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
- seqcount_t count;
+ seqcount_rwlock_t count;
struct delayed_work gc_work;
};
@@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}
+/* An element is an interval start exactly when nft_rbtree_interval_end()
+ * reports false for it.
+ */
+static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
+{
+ return !nft_rbtree_interval_end(rbe);
+}
+
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
const struct nft_rbtree_elem *interval)
{
@@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (interval &&
nft_rbtree_equal(set, this, interval) &&
nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(interval))
+ nft_rbtree_interval_start(interval))
continue;
interval = rbe;
} else if (d > 0)
@@ -74,6 +79,10 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
parent = rcu_dereference_raw(parent->rb_left);
continue;
}
+
+ if (nft_set_elem_expired(&rbe->ext))
+ return false;
+
if (nft_rbtree_interval_end(rbe)) {
if (nft_set_is_anonymous(set))
return false;
@@ -89,7 +98,8 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_interval_end(interval)) {
+ !nft_set_elem_expired(&interval->ext) &&
+ nft_rbtree_interval_start(interval)) {
*ext = &interval->ext;
return true;
}
@@ -97,8 +107,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
return false;
}
-static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
@@ -149,6 +160,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
continue;
}
+ if (nft_set_elem_expired(&rbe->ext))
+ return false;
+
if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
(flags & NFT_SET_ELEM_INTERVAL_END)) {
@@ -165,6 +179,7 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
+ !nft_set_elem_expired(&interval->ext) &&
((!nft_rbtree_interval_end(interval) &&
!(flags & NFT_SET_ELEM_INTERVAL_END)) ||
(nft_rbtree_interval_end(interval) &&
@@ -204,12 +219,66 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
struct nft_set_ext **ext)
{
+ bool overlap = false, dup_end_left = false, dup_end_right = false;
struct nft_rbtree *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
struct nft_rbtree_elem *rbe;
struct rb_node *parent, **p;
int d;
+ /* Detect overlaps as we descend the tree. Set the flag in these cases:
+ *
+ * a1. _ _ __>| ?_ _ __| (insert end before existing end)
+ * a2. _ _ ___| ?_ _ _>| (insert end after existing end)
+ * a3. _ _ ___? >|_ _ __| (insert start before existing end)
+ *
+ * and clear it later on, as we eventually reach the points indicated by
+ * '?' above, in the cases described below. We'll always meet these
+ * later, locally, due to tree ordering, and overlaps for the intervals
+ * that are the closest together are always evaluated last.
+ *
+ * b1. _ _ __>| !_ _ __| (insert end before existing start)
+ * b2. _ _ ___| !_ _ _>| (insert end after existing start)
+ * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf)
+ * '--' no nodes falling in this range
+ * b4. >|_ _ ! (insert start before existing start)
+ *
+ * Case a3. resolves to b3.:
+ * - if the inserted start element is the leftmost, because the '0'
+ * element in the tree serves as end element
+ * - otherwise, if an existing end is found immediately to the left. If
+ * there are existing nodes in between, we need to further descend the
+ * tree before we can conclude the new start isn't causing an overlap
+ *
+ * or to b4., which, preceded by a3., means we already traversed one or
+ * more existing intervals entirely, from the right.
+ *
+ * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
+ * in that order.
+ *
+ * The flag is also cleared in two special cases:
+ *
+ * b5. |__ _ _!|<_ _ _ (insert start right before existing end)
+ * b6. |__ _ >|!__ _ _ (insert end right after existing start)
+ *
+ * which always happen as last step and imply that no further
+ * overlapping is possible.
+ *
+ * Another special case comes from the fact that start elements matching
+ * an already existing start element are allowed: insertion is not
+ * performed but we return -EEXIST in that case, and the error will be
+ * cleared by the caller if NLM_F_EXCL is not present in the request.
+ * This way, request for insertion of an exact overlap isn't reported as
+ * error to userspace if not desired.
+ *
+ * However, if the new start matches a pre-existing start, but the
+ * end element doesn't match the corresponding pre-existing end element,
+ * we need to report a partial overlap. This is a local condition that
+ * can be noticed without need for a tracking flag, by checking for a
+ * local duplicated end for a corresponding start, from left and right,
+ * separately.
+ */
+
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
@@ -218,25 +287,82 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
d = memcmp(nft_set_ext_key(&rbe->ext),
nft_set_ext_key(&new->ext),
set->klen);
- if (d < 0)
+ if (d < 0) {
p = &parent->rb_left;
- else if (d > 0)
+
+ if (nft_rbtree_interval_start(new)) {
+ if (nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_set_elem_expired(&rbe->ext) && !*p)
+ overlap = false;
+ } else {
+ if (dup_end_left && !*p)
+ return -ENOTEMPTY;
+
+ overlap = nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask) &&
+ !nft_set_elem_expired(&rbe->ext);
+
+ if (overlap) {
+ dup_end_right = true;
+ continue;
+ }
+ }
+ } else if (d > 0) {
p = &parent->rb_right;
- else {
+
+ if (nft_rbtree_interval_end(new)) {
+ if (dup_end_right && !*p)
+ return -ENOTEMPTY;
+
+ overlap = nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask) &&
+ !nft_set_elem_expired(&rbe->ext);
+
+ if (overlap) {
+ dup_end_left = true;
+ continue;
+ }
+ } else if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_set_elem_expired(&rbe->ext)) {
+ overlap = nft_rbtree_interval_end(rbe);
+ }
+ } else {
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(new)) {
+ nft_rbtree_interval_start(new)) {
p = &parent->rb_left;
- } else if (!nft_rbtree_interval_end(rbe) &&
+
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_set_elem_expired(&rbe->ext))
+ overlap = false;
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(new)) {
p = &parent->rb_right;
- } else if (nft_set_elem_active(&rbe->ext, genmask)) {
+
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_set_elem_expired(&rbe->ext))
+ overlap = false;
+ } else if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_set_elem_expired(&rbe->ext)) {
*ext = &rbe->ext;
return -EEXIST;
} else {
- p = &parent->rb_left;
+ overlap = false;
+ if (nft_rbtree_interval_end(rbe))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
}
}
+
+ dup_end_left = dup_end_right = false;
}
+
+ if (overlap)
+ return -ENOTEMPTY;
+
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@@ -317,10 +443,10 @@ static void *nft_rbtree_deactivate(const struct net *net,
parent = parent->rb_right;
else {
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(this)) {
+ nft_rbtree_interval_start(this)) {
parent = parent->rb_left;
continue;
- } else if (!nft_rbtree_interval_end(rbe) &&
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
@@ -350,6 +476,8 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
if (iter->count < iter->skip)
goto cont;
+ if (nft_set_elem_expired(&rbe->ext))
+ goto cont;
if (!nft_set_elem_active(&rbe->ext, iter->genmask))
goto cont;
@@ -418,6 +546,12 @@ static void nft_rbtree_gc(struct work_struct *work)
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
+ rbe = nft_set_catchall_gc(set);
+ if (rbe) {
+ gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+ if (gcb)
+ nft_set_gc_batch_add(gcb, rbe);
+ }
nft_set_gc_batch_complete(gcb);
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
@@ -437,7 +571,7 @@ static int nft_rbtree_init(const struct nft_set *set,
struct nft_rbtree *priv = nft_set_priv(set);
rwlock_init(&priv->lock);
- seqcount_init(&priv->count);
+ seqcount_rwlock_init(&priv->count, &priv->lock);
priv->root = RB_ROOT;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
@@ -481,8 +615,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_rbtree_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
.privsize = nft_rbtree_privsize,
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 637ce3e8c575..49a5348a6a14 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -9,11 +9,78 @@
struct nft_socket {
enum nft_socket_keys key:8;
+ u8 level;
+ u8 len;
union {
- enum nft_registers dreg:8;
+ u8 dreg;
};
};
+/* Store into @dest whether the receive address bound to @sk is the "any"
+ * address for the packet's protocol family. For any other family nothing is
+ * stored and the verdict is set to NFT_BREAK.
+ */
+static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
+				struct nft_regs *regs, struct sock *sk,
+				u32 *dest)
+{
+	const unsigned int family = nft_pf(pkt);
+
+	if (family == NFPROTO_IPV4) {
+		nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0);
+		return;
+	}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	if (family == NFPROTO_IPV6) {
+		nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr));
+		return;
+	}
+#endif
+
+	/* Unsupported family: stop evaluating this rule */
+	regs->verdict.code = NFT_BREAK;
+}
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+/* Copy the 64-bit id of the ancestor cgroup of @sk at @level into @dest.
+ *
+ * Return: true if an id was stored, false if @sk is not a full socket or no
+ * ancestor was returned for @level. @pkt is not used.
+ */
+static noinline bool
+nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
+{
+	struct cgroup *ancestor;
+	u64 id;
+
+	if (sk_fullsock(sk)) {
+		ancestor = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data),
+					   level);
+		if (ancestor) {
+			id = cgroup_id(ancestor);
+			memcpy(dest, &id, sizeof(id));
+			return true;
+		}
+	}
+
+	return false;
+}
+#endif
+
+/* Look up the socket for the packet through the slow-path helper that matches
+ * the packet's protocol family.
+ *
+ * Return: the socket returned by the helper, or NULL if the packet has no
+ * input device or the family is not handled (the latter also warns once).
+ */
+static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
+{
+	const struct net_device *indev = nft_in(pkt);
+
+	if (!indev)
+		return NULL;
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		return nf_sk_lookup_slow_v4(nft_net(pkt), pkt->skb, indev);
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case NFPROTO_IPV6:
+		return nf_sk_lookup_slow_v6(nft_net(pkt), pkt->skb, indev);
+#endif
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+}
+
static void nft_socket_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -27,20 +94,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
sk = NULL;
if (!sk)
- switch(nft_pf(pkt)) {
- case NFPROTO_IPV4:
- sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
- break;
-#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
- case NFPROTO_IPV6:
- sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
- break;
-#endif
- default:
- WARN_ON_ONCE(1);
- regs->verdict.code = NFT_BREAK;
- return;
- }
+ sk = nft_socket_do_lookup(pkt);
if (!sk) {
regs->verdict.code = NFT_BREAK;
@@ -59,6 +113,21 @@ static void nft_socket_eval(const struct nft_expr *expr,
return;
}
break;
+ case NFT_SOCKET_WILDCARD:
+ if (!sk_fullsock(sk)) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ nft_socket_wildcard(pkt, regs, sk, dest);
+ break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case NFT_SOCKET_CGROUPV2:
+ if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ break;
+#endif
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
@@ -71,6 +140,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
[NFTA_SOCKET_KEY] = { .type = NLA_U32 },
[NFTA_SOCKET_DREG] = { .type = NLA_U32 },
+ [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 },
};
static int nft_socket_init(const struct nft_ctx *ctx,
@@ -94,21 +164,38 @@ static int nft_socket_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY]));
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
+ case NFT_SOCKET_WILDCARD:
len = sizeof(u8);
break;
case NFT_SOCKET_MARK:
len = sizeof(u32);
break;
+#ifdef CONFIG_CGROUPS
+ case NFT_SOCKET_CGROUPV2: {
+ unsigned int level;
+
+ if (!tb[NFTA_SOCKET_LEVEL])
+ return -EINVAL;
+
+ level = ntohl(nla_get_be32(tb[NFTA_SOCKET_LEVEL]));
+ if (level > 255)
+ return -EOPNOTSUPP;
+
+ priv->level = level;
+ len = sizeof(u64);
+ break;
+ }
+#endif
default:
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_socket_dump(struct sk_buff *skb,
@@ -116,13 +203,51 @@ static int nft_socket_dump(struct sk_buff *skb,
{
const struct nft_socket *priv = nft_expr_priv(expr);
- if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+ if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
return -1;
if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
return -1;
+ if (priv->key == NFT_SOCKET_CGROUPV2 &&
+ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
+ return -1;
return 0;
}
+/*
+ * Register-tracking callback for the socket expression.
+ *
+ * Compares this expression against whatever is currently tracked for its
+ * destination register. Returns false (after recording this expression as
+ * the new register content) when the tracked selector does not match, and
+ * true when the same key/dreg/level is already tracked and no bitwise
+ * operation is recorded for the register.
+ * NOTE(review): "true" presumably tells the caller this expression is
+ * redundant and may be elided - confirm against the reduce caller.
+ */
+static bool nft_socket_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_socket *priv = nft_expr_priv(expr);
+ const struct nft_socket *socket;
+
+ /* Tracked register content does not compare equal: start tracking us. */
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ /* Tracked selector is read as a socket expression; compare its parameters. */
+ socket = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != socket->key ||
+ priv->dreg != socket->dreg ||
+ priv->level != socket->level) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ /* Identical selector and no bitwise operation tracked on the register. */
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ /* A bitwise operation is tracked; defer the decision to the helper. */
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+/*
+ * Chain validation callback for the socket expression.
+ *
+ * Returns the result of nft_chain_validate_hooks() for a hook mask that
+ * contains only PRE_ROUTING, LOCAL_IN and LOCAL_OUT. @expr and @data are
+ * not used.
+ */
+static int nft_socket_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+
static struct nft_expr_type nft_socket_type;
static const struct nft_expr_ops nft_socket_ops = {
.type = &nft_socket_type,
@@ -130,6 +255,8 @@ static const struct nft_expr_ops nft_socket_ops = {
.eval = nft_socket_eval,
.init = nft_socket_init,
.dump = nft_socket_dump,
+ .validate = nft_socket_validate,
+ .reduce = nft_socket_reduce,
};
static struct nft_expr_type nft_socket_type __read_mostly = {
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index e2c1fc608841..6cf9a04fbfe2 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv,
{
struct synproxy_options opts = {};
struct sk_buff *skb = pkt->skb;
- int thoff = pkt->xt.thoff;
+ int thoff = nft_thoff(pkt);
const struct tcphdr *tcp;
struct tcphdr _tcph;
@@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv,
return;
}
- tcp = skb_header_pointer(skb, pkt->xt.thoff,
+ tcp = skb_header_pointer(skb, thoff,
sizeof(struct tcphdr),
&_tcph);
if (!tcp) {
@@ -191,8 +191,10 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
if (err)
goto nf_ct_failure;
err = nf_synproxy_ipv6_init(snet, ctx->net);
- if (err)
+ if (err) {
+ nf_synproxy_ipv4_fini(snet, ctx->net);
goto nf_ct_failure;
+ }
break;
}
@@ -286,6 +288,7 @@ static const struct nft_expr_ops nft_synproxy_ops = {
.dump = nft_synproxy_dump,
.type = &nft_synproxy_type,
.validate = nft_synproxy_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_synproxy_type __read_mostly = {
@@ -388,3 +391,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
MODULE_ALIAS_NFT_EXPR("synproxy");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY);
+MODULE_DESCRIPTION("nftables SYNPROXY expression support");
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index d67f83a0958d..62da25ad264b 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -13,9 +13,9 @@
#endif
struct nft_tproxy {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_port:8;
- u8 family;
+ u8 sreg_addr;
+ u8 sreg_port;
+ u8 family;
};
static void nft_tproxy_eval_v4(const struct nft_expr *expr,
@@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
__be16 tport = 0;
struct sock *sk;
+ if (pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
if (!hp) {
regs->verdict.code = NFT_BREAK;
@@ -46,11 +52,11 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
if (priv->sreg_addr)
- taddr = regs->data[priv->sreg_addr];
+ taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]);
taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
if (priv->sreg_port)
- tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -82,16 +88,17 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
const struct nft_tproxy *priv = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct in6_addr taddr;
- int thoff = pkt->xt.thoff;
+ int thoff = nft_thoff(pkt);
struct udphdr _hdr, *hp;
+ struct in6_addr taddr;
__be16 tport = 0;
struct sock *sk;
int l4proto;
memset(&taddr, 0, sizeof(taddr));
- if (!pkt->tprot_set) {
+ if (pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -117,7 +124,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
if (priv->sreg_port)
- tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -247,15 +254,15 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
}
if (tb[NFTA_TPROXY_REG_ADDR]) {
- priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, alen);
+ err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR],
+ &priv->sreg_addr, alen);
if (err < 0)
return err;
}
if (tb[NFTA_TPROXY_REG_PORT]) {
- priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
- err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
+ err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT],
+ &priv->sreg_port, sizeof(u16));
if (err < 0)
return err;
}
@@ -263,6 +270,29 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
return 0;
}
+/*
+ * Destroy callback for the tproxy expression.
+ *
+ * Calls the defrag "disable" helper(s) that correspond to the family stored
+ * in the expression: IPv4 only, IPv6 only, or both for NFPROTO_UNSPEC.
+ * IPv6 handling is compiled in only with CONFIG_NF_TABLES_IPV6.
+ * NOTE(review): presumably this balances defrag enable calls made by the
+ * init path, which is not visible here - confirm.
+ */
+static void nft_tproxy_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ nf_defrag_ipv4_disable(ctx->net);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nf_defrag_ipv6_disable(ctx->net);
+ break;
+#endif
+ case NFPROTO_UNSPEC:
+ /* Family-agnostic expression: disable both address families. */
+ nf_defrag_ipv4_disable(ctx->net);
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ nf_defrag_ipv6_disable(ctx->net);
+#endif
+ break;
+ }
+}
+
static int nft_tproxy_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -282,13 +312,23 @@ static int nft_tproxy_dump(struct sk_buff *skb,
return 0;
}
+/*
+ * Chain validation callback for the tproxy expression.
+ *
+ * Returns the result of nft_chain_validate_hooks() for a hook mask that
+ * contains only NF_INET_PRE_ROUTING. @expr and @data are not used.
+ */
+static int nft_tproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+}
+
static struct nft_expr_type nft_tproxy_type;
static const struct nft_expr_ops nft_tproxy_ops = {
.type = &nft_tproxy_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
.eval = nft_tproxy_eval,
.init = nft_tproxy_init,
+ .destroy = nft_tproxy_destroy,
.dump = nft_tproxy_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .validate = nft_tproxy_validate,
};
static struct nft_expr_type nft_tproxy_type __read_mostly = {
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 764e88682a81..983ade4be3b3 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -11,11 +11,13 @@
#include <net/ip_tunnels.h>
#include <net/vxlan.h>
#include <net/erspan.h>
+#include <net/geneve.h>
struct nft_tunnel {
enum nft_tunnel_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
enum nft_tunnel_mode mode:8;
+ u8 len;
};
static void nft_tunnel_get_eval(const struct nft_expr *expr,
@@ -92,8 +94,6 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
-
if (tb[NFTA_TUNNEL_MODE]) {
priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE]));
if (priv->mode > NFT_TUNNEL_MODE_MAX)
@@ -102,8 +102,9 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
priv->mode = NFT_TUNNEL_MODE_NONE;
}
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_tunnel_get_dump(struct sk_buff *skb,
@@ -123,6 +124,31 @@ nla_put_failure:
return -1;
}
+/*
+ * Register-tracking callback for the tunnel "get" expression.
+ *
+ * Returns true only when the destination register already tracks a tunnel
+ * expression with the same key, dreg and mode and no bitwise operation is
+ * recorded for that register. In every other case it returns false; when
+ * the tracked selector does not match, this expression is recorded as the
+ * new register content first.
+ * NOTE(review): "true" presumably marks this expression as redundant -
+ * confirm against the reduce caller.
+ */
+static bool nft_tunnel_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+ const struct nft_tunnel *tunnel;
+
+ /* Tracked register content does not compare equal: start tracking us. */
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ /* Tracked selector is read as a tunnel expression; compare its parameters. */
+ tunnel = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != tunnel->key ||
+ priv->dreg != tunnel->dreg ||
+ priv->mode != tunnel->mode) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ /* A bitwise operation is tracked on the register: never reduce here. */
+ return false;
+}
+
static struct nft_expr_type nft_tunnel_type;
static const struct nft_expr_ops nft_tunnel_get_ops = {
.type = &nft_tunnel_type,
@@ -130,10 +156,12 @@ static const struct nft_expr_ops nft_tunnel_get_ops = {
.eval = nft_tunnel_get_eval,
.init = nft_tunnel_get_init,
.dump = nft_tunnel_get_dump,
+ .reduce = nft_tunnel_get_reduce,
};
static struct nft_expr_type nft_tunnel_type __read_mostly = {
.name = "tunnel",
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_get_ops,
.policy = nft_tunnel_policy,
.maxattr = NFTA_TUNNEL_MAX,
@@ -144,6 +172,7 @@ struct nft_tunnel_opts {
union {
struct vxlan_metadata vxlan;
struct erspan_metadata erspan;
+ u8 data[IP_TUNNEL_OPTS_MAX];
} u;
u32 len;
__be16 flags;
@@ -301,9 +330,53 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
return 0;
}
+/*
+ * Netlink policy for the attributes nested inside one geneve option:
+ * a 16-bit class, an 8-bit type and a binary payload of at most 128 bytes.
+ */
+static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
+ [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+};
+
+/*
+ * Parse one nested geneve option attribute and append it to @opts.
+ *
+ * @attr: nested attribute carrying the class, type and data of one option
+ * @opts: option buffer; opts->len is the number of bytes already in use
+ *
+ * Returns 0 on success, the nla_parse_nested() error if parsing fails, or
+ * -EINVAL if an attribute is missing, the data length is not a multiple of
+ * 4, or the option does not fit into IP_TUNNEL_OPTS_MAX bytes.
+ */
+static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ /* opts->len is a byte offset, so it must be added before the cast;
+ * adding it afterwards would advance the pointer by
+ * opts->len * sizeof(struct geneve_opt) and place any option after
+ * the first one at the wrong address.
+ */
+ struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
+ struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
+ int err, data_len;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr,
+ nft_tunnel_opts_geneve_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_DATA])
+ return -EINVAL;
+
+ attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA];
+ data_len = nla_len(attr);
+ /* opt->length below is stored in units of 4 bytes. */
+ if (data_len % 4)
+ return -EINVAL;
+
+ /* Account for header plus payload, then make sure it still fits. */
+ opts->len += sizeof(*opt) + data_len;
+ if (opts->len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
+ opts->flags = TUNNEL_GENEVE_OPT;
+
+ return 0;
+}
+
static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_OPTS_UNSPEC] = {
+ .strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE },
[NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
[NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_OPTS_GENEVE] = { .type = NLA_NESTED, },
};
static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
@@ -311,22 +384,44 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct ip_tunnel_info *info,
struct nft_tunnel_opts *opts)
{
- struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
- int err;
+ struct nlattr *nla;
+ __be16 type = 0;
+ int err, rem;
- err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
- nft_tunnel_opts_policy, NULL);
+ err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
+ nft_tunnel_opts_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
- err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
- opts);
- } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
- err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
- opts);
- } else {
- return -EOPNOTSUPP;
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case NFTA_TUNNEL_KEY_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_vxlan_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_erspan_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_ERSPAN_OPT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT)
+ return -EINVAL;
+ err = nft_tunnel_obj_geneve_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
}
return err;
@@ -518,6 +613,25 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
break;
}
nla_nest_end(skb, inner);
+ } else if (opts->flags & TUNNEL_GENEVE_OPT) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
+ /* Walk the packed geneve options; offset counts bytes into u.data. */
+ while (opts->len > offset) {
+ /* Add the byte offset before casting, otherwise the pointer
+ * advances by offset * sizeof(struct geneve_opt).
+ */
+ opt = (struct geneve_opt *)(opts->u.data + offset);
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA,
+ opt->length * 4, opt->opt_data))
+ goto inner_failure;
+ /* opt->length is in 4-byte units; skip header plus payload. */
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+ nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;
@@ -633,3 +747,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("tunnel");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
+MODULE_DESCRIPTION("nftables tunnel expression support");
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 06d5cabf1d7c..1c5343c936a8 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -24,9 +24,10 @@ static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
struct nft_xfrm {
enum nft_xfrm_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
u8 dir;
u8 spnum;
+ u8 len;
};
static int nft_xfrm_get_init(const struct nft_ctx *ctx,
@@ -50,7 +51,7 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY]));
switch (priv->key) {
case NFT_XFRM_KEY_REQID:
case NFT_XFRM_KEY_SPI:
@@ -86,9 +87,9 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
priv->spnum = spnum;
- priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
/* Return true if key asks for daddr/saddr and current
@@ -133,13 +134,13 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
WARN_ON_ONCE(1);
break;
case NFT_XFRM_KEY_DADDR_IP4:
- *dest = state->id.daddr.a4;
+ *dest = (__force __u32)state->id.daddr.a4;
return;
case NFT_XFRM_KEY_DADDR_IP6:
memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
return;
case NFT_XFRM_KEY_SADDR_IP4:
- *dest = state->props.saddr.a4;
+ *dest = (__force __u32)state->props.saddr.a4;
return;
case NFT_XFRM_KEY_SADDR_IP6:
memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
@@ -148,7 +149,7 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
*dest = state->props.reqid;
return;
case NFT_XFRM_KEY_SPI:
- *dest = state->id.spi;
+ *dest = (__force __u32)state->id.spi;
return;
}
@@ -253,6 +254,31 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+/*
+ * Register-tracking callback for the xfrm expression.
+ *
+ * Returns false (after recording this expression as the new register
+ * content) when the destination register does not already track an xfrm
+ * expression with the same key, dreg, dir and spnum. Returns true when it
+ * does and no bitwise operation is recorded for the register; otherwise
+ * the result of nft_expr_reduce_bitwise() is returned.
+ * NOTE(review): "true" presumably marks this expression as redundant -
+ * confirm against the reduce caller.
+ */
+static bool nft_xfrm_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ const struct nft_xfrm *xfrm;
+
+ /* Tracked register content does not compare equal: start tracking us. */
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ /* Tracked selector is read as an xfrm expression; compare its parameters. */
+ xfrm = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != xfrm->key ||
+ priv->dreg != xfrm->dreg ||
+ priv->dir != xfrm->dir ||
+ priv->spnum != xfrm->spnum) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ /* A bitwise operation is tracked; defer the decision to the helper. */
+ return nft_expr_reduce_bitwise(track, expr);
+}
static struct nft_expr_type nft_xfrm_type;
static const struct nft_expr_ops nft_xfrm_get_ops = {
@@ -262,6 +288,7 @@ static const struct nft_expr_ops nft_xfrm_get_ops = {
.init = nft_xfrm_get_init,
.dump = nft_xfrm_get_dump,
.validate = nft_xfrm_validate,
+ .reduce = nft_xfrm_reduce,
};
static struct nft_expr_type nft_xfrm_type __read_mostly = {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 51b454d8fa9c..2182d361e273 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -25,7 +25,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP)
skb->csum = 0;
@@ -51,7 +51,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (len == skb->len - dataoff)
return nf_ip_checksum(skb, hook, dataoff, protocol);
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
skb->len - dataoff, 0);
@@ -79,7 +79,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
skb->csum = ~csum_unfold(
csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -106,7 +106,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (len == skb->len - dataoff)
return nf_ip6_checksum(skb, hook, dataoff, protocol);
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
hsum = skb_checksum(skb, 0, dataoff, 0);
skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
@@ -191,8 +191,8 @@ static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(entry->state.net, skb,
- RTN_UNSPEC);
+ return ip_route_me_harder(entry->state.net, entry->state.sk,
+ skb, RTN_UNSPEC);
}
#endif
return 0;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index cd2b034eef59..470282cf3fae 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -24,6 +24,7 @@
#include <linux/audit.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp.h>
@@ -38,6 +39,24 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define XT_PCPU_BLOCK_SIZE 4096
#define XT_MAX_TABLE_SIZE (512 * 1024 * 1024)
+/*
+ * Template describing a table that can be instantiated on demand.
+ * Entries are linked on the per-family xt_templates[] lists and are
+ * matched by name in xt_find_table_lock().
+ */
+struct xt_template {
+ /* linkage on xt_templates[af] */
+ struct list_head list;
+
+ /* called when table is needed in the given netns */
+ int (*table_init)(struct net *net);
+
+ /* owning module; a reference is taken before table_init() is called */
+ struct module *me;
+
+ /* A unique name... */
+ char name[XT_TABLE_MAXNAMELEN];
+};
+
+/* Per-family lists of struct xt_template entries. */
+static struct list_head xt_templates[NFPROTO_NUMPROTO];
+
+/*
+ * Per-network-namespace state, looked up with
+ * net_generic(net, xt_pernet_id): one list of tables per family.
+ */
+struct xt_pernet {
+ struct list_head tables[NFPROTO_NUMPROTO];
+};
+
struct compat_delta {
unsigned int offset; /* offset in kernel */
int delta; /* delta in 32bit user land */
@@ -47,7 +66,7 @@ struct xt_af {
struct mutex mutex;
struct list_head match;
struct list_head target;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct mutex compat_mutex;
struct compat_delta *compat_tab;
unsigned int number; /* number of slots in compat_tab[] */
@@ -55,7 +74,8 @@ struct xt_af {
#endif
};
-static struct xt_af *xt;
+static unsigned int xt_pernet_id __read_mostly;
+static struct xt_af *xt __read_mostly;
static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_UNSPEC] = "x",
@@ -330,6 +350,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
const struct xt_match *m;
int have_rev = 0;
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(m, &xt[af].match, list) {
if (strcmp(m->name, name) == 0) {
if (m->revision > *bestp)
@@ -338,6 +359,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
have_rev = 1;
}
}
+ mutex_unlock(&xt[af].mutex);
if (af != NFPROTO_UNSPEC && !have_rev)
return match_revfn(NFPROTO_UNSPEC, name, revision, bestp);
@@ -350,6 +372,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
const struct xt_target *t;
int have_rev = 0;
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &xt[af].target, list) {
if (strcmp(t->name, name) == 0) {
if (t->revision > *bestp)
@@ -358,6 +381,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
have_rev = 1;
}
}
+ mutex_unlock(&xt[af].mutex);
if (af != NFPROTO_UNSPEC && !have_rev)
return target_revfn(NFPROTO_UNSPEC, name, revision, bestp);
@@ -371,12 +395,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
{
int have_rev, best = -1;
- mutex_lock(&xt[af].mutex);
if (target == 1)
have_rev = target_revfn(af, name, revision, &best);
else
have_rev = match_revfn(af, name, revision, &best);
- mutex_unlock(&xt[af].mutex);
/* Nothing at all? Return 0 to try loading module. */
if (best == -1) {
@@ -639,7 +661,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
return usersize == kernsize && strnlen(msg, msglen) < msglen;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
struct xt_af *xp = &xt[af];
@@ -731,7 +753,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
{
const struct xt_match *match = m->u.kernel.match;
struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
- int pad, off = xt_compat_match_offset(match);
+ int off = xt_compat_match_offset(match);
u_int16_t msize = cm->u.user.match_size;
char name[sizeof(m->u.user.name)];
@@ -741,13 +763,10 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
match->compat_from_user(m->data, cm->data);
else
memcpy(m->data, cm->data, msize - sizeof(*cm));
- pad = XT_ALIGN(match->matchsize) - match->matchsize;
- if (pad > 0)
- memset(m->data + match->matchsize, 0, pad);
msize += off;
m->u.user.match_size = msize;
- strlcpy(name, match->name, sizeof(name));
+ strscpy(name, match->name, sizeof(name));
module_put(match->me);
strncpy(m->u.user.name, name, sizeof(m->u.user.name));
@@ -845,7 +864,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
__alignof__(struct compat_xt_entry_match));
}
EXPORT_SYMBOL(xt_compat_check_entry_offsets);
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
/**
* xt_check_entry_offsets - validate arp/ip/ip6t_entry
@@ -863,7 +882,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
* match structures are aligned, and that the last structure ends where
* the target structure begins.
*
- * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
+ * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version.
*
* The arp/ip/ip6t_entry structure @base must have passed following tests:
* - it must point to a valid memory location
@@ -1028,34 +1047,34 @@ int xt_check_target(struct xt_tgchk_param *par,
EXPORT_SYMBOL_GPL(xt_check_target);
/**
- * xt_copy_counters_from_user - copy counters and metadata from userspace
+ * xt_copy_counters - copy counters and metadata from a sockptr_t
*
- * @user: src pointer to userspace memory
+ * @arg: src sockptr
* @len: alleged size of userspace memory
* @info: where to store the xt_counters_info metadata
- * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel
*
* Copies counter metadata from @arg and stores it in @info.
*
* vmallocs memory to hold the counters, then copies the counter data
* from @arg to the new memory and returns a pointer to it.
*
- * If @compat is true, @info gets converted automatically to the 64bit
- * representation.
+ * If called from a compat syscall, @info gets converted automatically to the
+ * 64bit representation.
*
* The metadata associated with the counters is stored in @info.
*
* Return: returns pointer that caller has to test via IS_ERR().
* If IS_ERR is false, caller has to vfree the pointer.
*/
-void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
- struct xt_counters_info *info, bool compat)
+void *xt_copy_counters(sockptr_t arg, unsigned int len,
+ struct xt_counters_info *info)
{
+ size_t offset;
void *mem;
u64 size;
-#ifdef CONFIG_COMPAT
- if (compat) {
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall()) {
/* structures only differ in size due to alignment */
struct compat_xt_counters_info compat_tmp;
@@ -1063,12 +1082,12 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
return ERR_PTR(-EINVAL);
len -= sizeof(compat_tmp);
- if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
+ if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0)
return ERR_PTR(-EFAULT);
memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1);
info->num_counters = compat_tmp.num_counters;
- user += sizeof(compat_tmp);
+ offset = sizeof(compat_tmp);
} else
#endif
{
@@ -1076,10 +1095,10 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
return ERR_PTR(-EINVAL);
len -= sizeof(*info);
- if (copy_from_user(info, user, sizeof(*info)) != 0)
+ if (copy_from_sockptr(info, arg, sizeof(*info)) != 0)
return ERR_PTR(-EFAULT);
- user += sizeof(*info);
+ offset = sizeof(*info);
}
info->name[sizeof(info->name) - 1] = '\0';
@@ -1093,15 +1112,15 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
if (!mem)
return ERR_PTR(-ENOMEM);
- if (copy_from_user(mem, user, len) == 0)
+ if (copy_from_sockptr_offset(mem, arg, offset, len) == 0)
return mem;
vfree(mem);
return ERR_PTR(-EFAULT);
}
-EXPORT_SYMBOL_GPL(xt_copy_counters_from_user);
+EXPORT_SYMBOL_GPL(xt_copy_counters);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
int xt_compat_target_offset(const struct xt_target *target)
{
u_int16_t csize = target->compatsize ? : target->targetsize;
@@ -1114,7 +1133,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
{
const struct xt_target *target = t->u.kernel.target;
struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
- int pad, off = xt_compat_target_offset(target);
+ int off = xt_compat_target_offset(target);
u_int16_t tsize = ct->u.user.target_size;
char name[sizeof(t->u.user.name)];
@@ -1124,13 +1143,10 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
target->compat_from_user(t->data, ct->data);
else
memcpy(t->data, ct->data, tsize - sizeof(*ct));
- pad = XT_ALIGN(target->targetsize) - target->targetsize;
- if (pad > 0)
- memset(t->data + target->targetsize, 0, pad);
tsize += off;
t->u.user.target_size = tsize;
- strlcpy(name, target->name, sizeof(name));
+ strscpy(name, target->name, sizeof(name));
module_put(target->me);
strncpy(t->u.user.name, name, sizeof(t->u.user.name));
@@ -1197,50 +1213,65 @@ void xt_free_table_info(struct xt_table_info *info)
}
EXPORT_SYMBOL(xt_free_table_info);
+/*
+ * Look up a table by name in the given netns and family.
+ *
+ * Walks the per-netns table list for @af under xt[af].mutex and returns
+ * the first table whose name equals @name, or NULL if there is none.
+ * Unlike xt_find_table_lock(), the mutex is released before returning and
+ * no module reference is taken.
+ * NOTE(review): the returned pointer is therefore not protected by this
+ * function against concurrent unregistration - confirm callers provide
+ * their own guarantee.
+ */
+struct xt_table *xt_find_table(struct net *net, u8 af, const char *name)
+{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct xt_table *t;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_net->tables[af], list) {
+ if (strcmp(t->name, name) == 0) {
+ mutex_unlock(&xt[af].mutex);
+ return t;
+ }
+ }
+ mutex_unlock(&xt[af].mutex);
+ return NULL;
+}
+EXPORT_SYMBOL(xt_find_table);
+
/* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
- struct xt_table *t, *found = NULL;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct module *owner = NULL;
+ struct xt_template *tmpl;
+ struct xt_table *t;
mutex_lock(&xt[af].mutex);
- list_for_each_entry(t, &net->xt.tables[af], list)
+ list_for_each_entry(t, &xt_net->tables[af], list)
if (strcmp(t->name, name) == 0 && try_module_get(t->me))
return t;
- if (net == &init_net)
- goto out;
-
- /* Table doesn't exist in this netns, re-try init */
- list_for_each_entry(t, &init_net.xt.tables[af], list) {
+ /* Table doesn't exist in this netns, check larval list */
+ list_for_each_entry(tmpl, &xt_templates[af], list) {
int err;
- if (strcmp(t->name, name))
+ if (strcmp(tmpl->name, name))
continue;
- if (!try_module_get(t->me))
+ if (!try_module_get(tmpl->me))
goto out;
+
+ owner = tmpl->me;
+
mutex_unlock(&xt[af].mutex);
- err = t->table_init(net);
+ err = tmpl->table_init(net);
if (err < 0) {
- module_put(t->me);
+ module_put(owner);
return ERR_PTR(err);
}
- found = t;
-
mutex_lock(&xt[af].mutex);
break;
}
- if (!found)
- goto out;
-
/* and once again: */
- list_for_each_entry(t, &net->xt.tables[af], list)
+ list_for_each_entry(t, &xt_net->tables[af], list)
if (strcmp(t->name, name) == 0)
return t;
- module_put(found->me);
+ module_put(owner);
out:
mutex_unlock(&xt[af].mutex);
return ERR_PTR(-ENOENT);
@@ -1271,7 +1302,7 @@ void xt_table_unlock(struct xt_table *table)
}
EXPORT_SYMBOL_GPL(xt_table_unlock);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
void xt_compat_lock(u_int8_t af)
{
mutex_lock(&xt[af].compat_mutex);
@@ -1387,7 +1418,7 @@ xt_replace_table(struct xt_table *table,
table->private = newinfo;
/* make sure all cpus see new ->private value */
- smp_wmb();
+ smp_mb();
/*
* Even though table entries have now been swapped, other CPU's
@@ -1408,15 +1439,10 @@ xt_replace_table(struct xt_table *table,
}
}
-#ifdef CONFIG_AUDIT
- if (audit_enabled) {
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_NETFILTER_CFG,
- "table=%s family=%u entries=%u",
- table->name, table->af, private->number);
- }
-#endif
-
+ audit_log_nfcfg(table->name, table->af, private->number,
+ !private->number ? AUDIT_XT_OP_REGISTER :
+ AUDIT_XT_OP_REPLACE,
+ GFP_KERNEL);
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1426,9 +1452,10 @@ struct xt_table *xt_register_table(struct net *net,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
- int ret;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
struct xt_table_info *private;
struct xt_table *t, *table;
+ int ret;
/* Don't add one object to multiple lists. */
table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
@@ -1439,7 +1466,7 @@ struct xt_table *xt_register_table(struct net *net,
mutex_lock(&xt[table->af].mutex);
/* Don't autoload: we'd eat our tail... */
- list_for_each_entry(t, &net->xt.tables[table->af], list) {
+ list_for_each_entry(t, &xt_net->tables[table->af], list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
goto unlock;
@@ -1458,7 +1485,7 @@ struct xt_table *xt_register_table(struct net *net,
/* save number of initial entries */
private->initial_entries = private->number;
- list_add(&table->list, &net->xt.tables[table->af]);
+ list_add(&table->list, &xt_net->tables[table->af]);
mutex_unlock(&xt[table->af].mutex);
return table;
@@ -1478,6 +1505,9 @@ void *xt_unregister_table(struct xt_table *table)
private = table->private;
list_del(&table->list);
mutex_unlock(&xt[table->af].mutex);
+ audit_log_nfcfg(table->name, table->af, private->number,
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+ kfree(table->ops);
kfree(table);
return private;
@@ -1487,24 +1517,30 @@ EXPORT_SYMBOL_GPL(xt_unregister_table);
#ifdef CONFIG_PROC_FS
static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
{
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ struct xt_pernet *xt_net;
+
+ xt_net = net_generic(net, xt_pernet_id);
mutex_lock(&xt[af].mutex);
- return seq_list_start(&net->xt.tables[af], *pos);
+ return seq_list_start(&xt_net->tables[af], *pos);
}
static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ struct xt_pernet *xt_net;
- return seq_list_next(v, &net->xt.tables[af], pos);
+ xt_net = net_generic(net, xt_pernet_id);
+
+ return seq_list_next(v, &xt_net->tables[af], pos);
}
static void xt_table_seq_stop(struct seq_file *seq, void *v)
{
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ u_int8_t af = (unsigned long)pde_data(file_inode(seq->file));
mutex_unlock(&xt[af].mutex);
}
@@ -1548,7 +1584,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
[MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
[MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
};
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
if (ppos != NULL)
@@ -1575,7 +1611,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
trav->curr = trav->curr->next;
if (trav->curr != trav->head)
break;
- /* fall through */
+ fallthrough;
default:
return NULL;
}
@@ -1597,7 +1633,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
{
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
switch (trav->class) {
@@ -1722,6 +1758,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
}
EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
+int xt_register_template(const struct xt_table *table,
+ int (*table_init)(struct net *net))
+{
+ int ret = -EEXIST, af = table->af;
+ struct xt_template *t;
+
+ mutex_lock(&xt[af].mutex);
+
+ list_for_each_entry(t, &xt_templates[af], list) {
+ if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0))
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto out_unlock;
+
+ BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name));
+
+ strscpy(t->name, table->name, sizeof(t->name));
+ t->table_init = table_init;
+ t->me = table->me;
+ list_add(&t->list, &xt_templates[af]);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&xt[af].mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xt_register_template);
+
+void xt_unregister_template(const struct xt_table *table)
+{
+ struct xt_template *t;
+ int af = table->af;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_templates[af], list) {
+ if (strcmp(table->name, t->name))
+ continue;
+
+ list_del(&t->list);
+ mutex_unlock(&xt[af].mutex);
+ kfree(t);
+ return;
+ }
+
+ mutex_unlock(&xt[af].mutex);
+ WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL_GPL(xt_unregister_template);
+
int xt_proto_init(struct net *net, u_int8_t af)
{
#ifdef CONFIG_PROC_FS
@@ -1739,7 +1827,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
root_uid = make_kuid(net->user_ns, 0);
root_gid = make_kgid(net->user_ns, 0);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops,
sizeof(struct seq_net_private),
@@ -1749,7 +1837,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_match_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1759,7 +1847,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_target_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1774,12 +1862,12 @@ int xt_proto_init(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
out_remove_matches:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out_remove_tables:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out:
@@ -1793,15 +1881,15 @@ void xt_proto_fini(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
char buf[XT_FUNCTION_MAXNAMELEN];
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
#endif /*CONFIG_PROC_FS*/
@@ -1865,24 +1953,28 @@ EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
static int __net_init xt_net_init(struct net *net)
{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
for (i = 0; i < NFPROTO_NUMPROTO; i++)
- INIT_LIST_HEAD(&net->xt.tables[i]);
+ INIT_LIST_HEAD(&xt_net->tables[i]);
return 0;
}
static void __net_exit xt_net_exit(struct net *net)
{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
for (i = 0; i < NFPROTO_NUMPROTO; i++)
- WARN_ON_ONCE(!list_empty(&net->xt.tables[i]));
+ WARN_ON_ONCE(!list_empty(&xt_net->tables[i]));
}
static struct pernet_operations xt_net_ops = {
.init = xt_net_init,
.exit = xt_net_exit,
+ .id = &xt_pernet_id,
+ .size = sizeof(struct xt_pernet),
};
static int __init xt_init(void)
@@ -1900,12 +1992,13 @@ static int __init xt_init(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++) {
mutex_init(&xt[i].mutex);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
mutex_init(&xt[i].compat_mutex);
xt[i].compat_tab = NULL;
#endif
INIT_LIST_HEAD(&xt[i].target);
INIT_LIST_HEAD(&xt[i].match);
+ INIT_LIST_HEAD(&xt_templates[i]);
}
rv = register_pernet_subsys(&xt_net_ops);
if (rv < 0)
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 9cdc16b0d0d8..b6a015aee0ce 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -117,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par)
const struct xt_audit_info *info = par->targinfo;
if (info->type > XT_AUDIT_TYPE_MAX) {
- pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n",
+ pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n",
XT_AUDIT_TYPE_MAX);
return -ERANGE;
}
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index a5c8b653476a..76acecf3e757 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -6,7 +6,7 @@
* with the SECMARK target and state match.
*
* Based somewhat on CONNMARK:
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
* by Henrik Nordstrom <hno@marasystems.com>
*
* (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index d4deee39158b..2be2f7a7b60f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
return XT_CONTINUE;
if (ct) {
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
} else {
nf_ct_set(skb, ct, IP_CT_UNTRACKED);
@@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
return -ENOMEM;
}
- help->helper = helper;
+ rcu_assign_pointer(help->helper, helper);
return 0;
}
@@ -136,6 +136,21 @@ static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
}
}
+static void xt_ct_put_helper(struct nf_conn_help *help)
+{
+ struct nf_conntrack_helper *helper;
+
+ if (!help)
+ return;
+
+ /* not yet exposed to other cpus, or ruleset
+ * already detached (post-replacement).
+ */
+ helper = rcu_dereference_raw(help->helper);
+ if (helper)
+ nf_conntrack_helper_put(helper);
+}
+
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
@@ -172,7 +187,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err2;
}
- ret = 0;
if ((info->ct_events || info->exp_events) &&
!nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
GFP_KERNEL)) {
@@ -202,15 +216,13 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err4;
}
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
out:
info->ct = ct;
return 0;
err4:
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
err3:
nf_ct_tmpl_free(ct);
err2:
@@ -272,8 +284,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (ct) {
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
nf_ct_netns_put(par->net, par->family);
@@ -352,21 +363,10 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static int notrack_chk(const struct xt_tgchk_param *par)
-{
- if (!par->net->xt.notrack_deprecated_warning) {
- pr_info("netfilter: NOTRACK target is deprecated, "
- "use CT instead or upgrade iptables\n");
- par->net->xt.notrack_deprecated_warning = true;
- }
- return 0;
-}
-
static struct xt_target notrack_tg_reg __read_mostly = {
.name = "NOTRACK",
.revision = 0,
.family = NFPROTO_UNSPEC,
- .checkentry = notrack_chk,
.target = notrack_tg,
.table = "raw",
.me = THIS_MODULE,
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index eababc354ff1..cfa44515ab72 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -24,6 +24,8 @@ MODULE_ALIAS("ip6t_DSCP");
MODULE_ALIAS("ipt_TOS");
MODULE_ALIAS("ip6t_TOS");
+#define XT_DSCP_ECN_MASK 3u
+
static unsigned int
dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -34,8 +36,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
- ipv4_change_dsfield(ip_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
@@ -52,8 +53,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
- ipv6_change_dsfield(ipv6_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
index 713fb38541df..8928ec56c388 100644
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -276,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
return 0;
/* follow-up fragments don't contain ports, skip all fragments */
- if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ if (ip_is_fragment(ip))
return 0;
hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index f56d3ed93e56..0f8bb0bf558f 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/timer.h>
+#include <linux/alarmtimer.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/netfilter.h>
@@ -30,6 +31,7 @@
struct idletimer_tg {
struct list_head entry;
+ struct alarm alarm;
struct timer_list timer;
struct work_struct work;
@@ -37,6 +39,7 @@ struct idletimer_tg {
struct device_attribute attr;
unsigned int refcnt;
+ u8 timer_type;
};
static LIST_HEAD(idletimer_tg_list);
@@ -62,20 +65,29 @@ static ssize_t idletimer_tg_show(struct device *dev,
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ struct timespec64 ktimespec = {};
+ long time_diff = 0;
mutex_lock(&list_mutex);
timer = __idletimer_tg_find_by_label(attr->attr.name);
- if (timer)
- expires = timer->timer.expires;
+ if (timer) {
+ if (timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm);
+ ktimespec = ktime_to_timespec64(expires_alarm);
+ time_diff = ktimespec.tv_sec;
+ } else {
+ expires = timer->timer.expires;
+ time_diff = jiffies_to_msecs(expires - jiffies) / 1000;
+ }
+ }
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
- return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
+ return sysfs_emit(buf, "%ld\n", time_diff);
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -95,6 +107,16 @@ static void idletimer_tg_expired(struct timer_list *t)
schedule_work(&timer->work);
}
+static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
+ ktime_t now)
+{
+ struct idletimer_tg *timer = alarm->data;
+
+ pr_debug("alarm %s expired\n", timer->attr.attr.name);
+ schedule_work(&timer->work);
+ return ALARMTIMER_NORESTART;
+}
+
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
{
int ret;
@@ -115,7 +137,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
{
int ret;
- info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL);
if (!info->timer) {
ret = -ENOMEM;
goto out;
@@ -160,6 +182,68 @@ out:
return ret;
}
+static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
+{
+ int ret;
+
+ info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ if (!info->timer) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+ if (ret < 0)
+ goto out_free_timer;
+
+ sysfs_attr_init(&info->timer->attr.attr);
+ info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+ if (!info->timer->attr.attr.name) {
+ ret = -ENOMEM;
+ goto out_free_timer;
+ }
+ info->timer->attr.attr.mode = 0444;
+ info->timer->attr.show = idletimer_tg_show;
+
+ ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ if (ret < 0) {
+ pr_debug("couldn't add file to sysfs");
+ goto out_free_attr;
+ }
+
+ /* notify userspace */
+ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD);
+
+ list_add(&info->timer->entry, &idletimer_tg_list);
+ pr_debug("timer type value is %u", info->timer_type);
+ info->timer->timer_type = info->timer_type;
+ info->timer->refcnt = 1;
+
+ INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout;
+ alarm_init(&info->timer->alarm, ALARM_BOOTTIME,
+ idletimer_tg_alarmproc);
+ info->timer->alarm.data = info->timer;
+ tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+
+ return 0;
+
+out_free_attr:
+ kfree(info->timer->attr.attr.name);
+out_free_timer:
+ kfree(info->timer);
+out:
+ return ret;
+}
+
/*
* The actual xt_tables plugin.
*/
@@ -177,13 +261,30 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
return XT_CONTINUE;
}
-static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
{
- struct idletimer_tg_info *info = par->targinfo;
- int ret;
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_helper(struct idletimer_tg_info *info)
+{
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
return -EINVAL;
@@ -198,7 +299,23 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
pr_debug("label is empty or not nul-terminated\n");
return -EINVAL;
}
+ return 0;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ ret = idletimer_tg_helper(info);
+ if(ret < 0)
+ {
+ pr_debug("checkentry helper return invalid\n");
+ return -EINVAL;
+ }
mutex_lock(&list_mutex);
info->timer = __idletimer_tg_find_by_label(info->label);
@@ -222,6 +339,68 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
return 0;
}
+static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info_v1 *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ if (info->send_nl_msg)
+ return -EOPNOTSUPP;
+
+ ret = idletimer_tg_helper((struct idletimer_tg_info *)info);
+ if(ret < 0)
+ {
+ pr_debug("checkentry helper return invalid\n");
+ return -EINVAL;
+ }
+
+ if (info->timer_type > XT_IDLETIMER_ALARM) {
+ pr_debug("invalid value for timer type\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&list_mutex);
+
+ info->timer = __idletimer_tg_find_by_label(info->label);
+ if (info->timer) {
+ if (info->timer->timer_type != info->timer_type) {
+ pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n");
+ mutex_unlock(&list_mutex);
+ return -EINVAL;
+ }
+
+ info->timer->refcnt++;
+ if (info->timer_type & XT_IDLETIMER_ALARM) {
+ /* calculate remaining expiry time */
+ ktime_t tout = alarm_expires_remaining(&info->timer->alarm);
+ struct timespec64 ktimespec = ktime_to_timespec64(tout);
+
+ if (ktimespec.tv_sec > 0) {
+ pr_debug("time_expiry_remaining %lld\n",
+ ktimespec.tv_sec);
+ alarm_start_relative(&info->timer->alarm, tout);
+ }
+ } else {
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+ pr_debug("increased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ } else {
+ ret = idletimer_tg_create_v1(info);
+ if (ret < 0) {
+ pr_debug("failed to create timer\n");
+ mutex_unlock(&list_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&list_mutex);
+ return 0;
+}
+
static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
@@ -247,7 +426,38 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
mutex_unlock(&list_mutex);
}
-static struct xt_target idletimer_tg __read_mostly = {
+static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
+{
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
+
+ pr_debug("destroy targinfo %s\n", info->label);
+
+ mutex_lock(&list_mutex);
+
+ if (--info->timer->refcnt == 0) {
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ alarm_cancel(&info->timer->alarm);
+ } else {
+ del_timer_sync(&info->timer->timer);
+ }
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
+ } else {
+ pr_debug("decreased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ }
+
+ mutex_unlock(&list_mutex);
+}
+
+
+static struct xt_target idletimer_tg[] __read_mostly = {
+ {
.name = "IDLETIMER",
.family = NFPROTO_UNSPEC,
.target = idletimer_tg_target,
@@ -256,6 +466,20 @@ static struct xt_target idletimer_tg __read_mostly = {
.checkentry = idletimer_tg_checkentry,
.destroy = idletimer_tg_destroy,
.me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+
+
};
static struct class *idletimer_tg_class;
@@ -283,7 +507,8 @@ static int __init idletimer_tg_init(void)
idletimer_tg_kobj = &idletimer_tg_device->kobj;
- err = xt_register_target(&idletimer_tg);
+ err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
+
if (err < 0) {
pr_debug("couldn't register xt target\n");
goto out_dev;
@@ -300,7 +525,7 @@ out:
static void __exit idletimer_tg_exit(void)
{
- xt_unregister_target(&idletimer_tg);
+ xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
device_destroy(idletimer_tg_class, MKDEV(0, 0));
class_destroy(idletimer_tg_class);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index a1e79b517c01..f39244f9c0ed 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int log_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
+ int ret;
if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6)
return -EINVAL;
@@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ if (ret != 0 && !par->nft_compat) {
+ request_module("%s", "nf_log_syslog");
+
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ }
+
+ return ret;
}
static void log_tg_destroy(const struct xt_tgdtor_param *par)
@@ -108,3 +116,4 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging");
MODULE_ALIAS("ipt_LOG");
MODULE_ALIAS("ip6t_LOG");
+MODULE_SOFTDEP("pre: nf_log_syslog");
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 6e83ce3000db..e660c3710a10 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int nflog_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
+ int ret;
if (info->flags & ~XT_NFLOG_MASK)
return -EINVAL;
if (info->prefix[sizeof(info->prefix) - 1] != '\0')
return -EINVAL;
- return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ if (ret != 0 && !par->nft_compat) {
+ request_module("%s", "nfnetlink_log");
+
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ }
+
+ return ret;
}
static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
@@ -79,3 +87,4 @@ static void __exit nflog_tg_exit(void)
module_init(nflog_tg_init);
module_exit(nflog_tg_exit);
+MODULE_SOFTDEP("pre: nfnetlink_log");
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 37253d399c6b..80f6624e2355 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -94,11 +94,11 @@ static unsigned int
xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_rateest_target_info *info = par->targinfo;
- struct gnet_stats_basic_packed *stats = &info->est->bstats;
+ struct gnet_stats_basic_sync *stats = &info->est->bstats;
spin_lock_bh(&info->est->lock);
- stats->bytes += skb->len;
- stats->packets++;
+ u64_stats_add(&stats->bytes, skb->len);
+ u64_stats_inc(&stats->packets);
spin_unlock_bh(&info->est->lock);
return XT_CONTINUE;
@@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
} cfg;
int ret;
+ if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name))
+ return -ENAMETOOLONG;
+
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
mutex_lock(&xn->hash_lock);
@@ -140,7 +143,8 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
if (!est)
goto err1;
- strlcpy(est->name, info->name, sizeof(est->name));
+ gnet_stats_basic_sync_init(&est->bstats);
+ strscpy(est->name, info->name, sizeof(est->name));
spin_lock_init(&est->lock);
est->refcnt = 1;
est->params.interval = info->interval;
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 2317721f3ecb..498a0bf6f044 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -21,15 +21,12 @@ MODULE_DESCRIPTION("Xtables: packet security mark modification");
MODULE_ALIAS("ipt_SECMARK");
MODULE_ALIAS("ip6t_SECMARK");
-#define PFX "SECMARK: "
-
static u8 mode;
static unsigned int
-secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info)
{
u32 secmark = 0;
- const struct xt_secmark_target_info *info = par->targinfo;
switch (mode) {
case SECMARK_MODE_SEL:
@@ -43,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static int checkentry_lsm(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info_v1 *info)
{
int err;
@@ -75,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info)
return 0;
}
-static int secmark_tg_check(const struct xt_tgchk_param *par)
+static int
+secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info)
{
- struct xt_secmark_target_info *info = par->targinfo;
int err;
- if (strcmp(par->table, "mangle") != 0 &&
- strcmp(par->table, "security") != 0) {
+ if (strcmp(table, "mangle") != 0 &&
+ strcmp(table, "security") != 0) {
pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n",
- par->table);
+ table);
return -EINVAL;
}
@@ -118,25 +115,76 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
}
}
-static struct xt_target secmark_tg_reg __read_mostly = {
- .name = "SECMARK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = secmark_tg_check,
- .destroy = secmark_tg_destroy,
- .target = secmark_tg,
- .targetsize = sizeof(struct xt_secmark_target_info),
- .me = THIS_MODULE,
+static int secmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ struct xt_secmark_target_info *info = par->targinfo;
+ struct xt_secmark_target_info_v1 newinfo = {
+ .mode = info->mode,
+ };
+ int ret;
+
+ memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX);
+
+ ret = secmark_tg_check(par->table, &newinfo);
+ info->secid = newinfo.secid;
+
+ return ret;
+}
+
+static unsigned int
+secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_secmark_target_info *info = par->targinfo;
+ struct xt_secmark_target_info_v1 newinfo = {
+ .secid = info->secid,
+ };
+
+ return secmark_tg(skb, &newinfo);
+}
+
+static int secmark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ return secmark_tg_check(par->table, par->targinfo);
+}
+
+static unsigned int
+secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return secmark_tg(skb, par->targinfo);
+}
+
+static struct xt_target secmark_tg_reg[] __read_mostly = {
+ {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = secmark_tg_check_v0,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v0,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SECMARK",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = secmark_tg_check_v1,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v1,
+ .targetsize = sizeof(struct xt_secmark_target_info_v1),
+ .usersize = offsetof(struct xt_secmark_target_info_v1, secid),
+ .me = THIS_MODULE,
+ },
};
static int __init secmark_tg_init(void)
{
- return xt_register_target(&secmark_tg_reg);
+ return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
}
static void __exit secmark_tg_exit(void)
{
- xt_unregister_target(&secmark_tg_reg);
+ xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
}
module_init(secmark_tg_init);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 122db9fbb9f4..116a885adb3c 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -239,8 +239,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
oldlen = ipv6h->payload_len;
newlen = htons(ntohs(oldlen) + ret);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(csum_sub(skb->csum, oldlen),
- newlen);
+ skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen),
+ (__force __wsum)newlen);
ipv6h->payload_len = newlen;
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 194dc03341f3..e4bea1d346cf 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -74,18 +74,10 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
-
- pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->daddr, ntohs(hp->dest),
- &laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
return NF_DROP;
}
@@ -122,16 +114,12 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int tproto;
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
- pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ if (tproto < 0)
return NF_DROP;
- }
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ if (!hp)
return NF_DROP;
- }
/* check if there's an ongoing connection on the packet
* addresses, this happens if the redirect already happened
@@ -168,19 +156,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
-
- pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
-
return NF_DROP;
}
@@ -200,6 +179,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
pr_info_ratelimited("Can be used only with -p tcp or -p udp\n");
return -EINVAL;
}
+
+static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_defrag_ipv6_disable(par->net);
+}
#endif
static int tproxy_tg4_check(const struct xt_tgchk_param *par)
@@ -219,6 +203,11 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
+static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_defrag_ipv4_disable(par->net);
+}
+
static struct xt_target tproxy_tg_reg[] __read_mostly = {
{
.name = "TPROXY",
@@ -228,6 +217,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 0,
.targetsize = sizeof(struct xt_tproxy_target_info),
.checkentry = tproxy_tg4_check,
+ .destroy = tproxy_tg4_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
@@ -239,6 +229,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 1,
.targetsize = sizeof(struct xt_tproxy_target_info_v1),
.checkentry = tproxy_tg4_check,
+ .destroy = tproxy_tg4_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
@@ -251,6 +242,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 1,
.targetsize = sizeof(struct xt_tproxy_target_info_v1),
.checkentry = tproxy_tg6_check,
+ .destroy = tproxy_tg6_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 349ab5609b1b..5582dce98cae 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -52,3 +52,4 @@ static void __exit trace_tg_exit(void)
module_init(trace_tg_init);
module_exit(trace_tg_exit);
+MODULE_SOFTDEP("pre: nf_log_syslog");
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 13cf3f9b5938..849ac552a154 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
- return BPF_PROG_RUN(info->filter, skb);
+ return bpf_prog_run(info->filter, skb);
}
static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 46fcac75f726..5d04ef80a61d 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -62,10 +62,10 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
key[4] = zone->id;
} else {
const struct iphdr *iph = ip_hdr(skb);
- key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
- iph->daddr : iph->saddr;
- key[0] &= info->mask.ip;
+ key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
+ (__force __u32)iph->daddr : (__force __u32)iph->saddr;
+ key[0] &= (__force __u32)info->mask.ip;
key[1] = zone->id;
}
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index eec2f3a88d73..e5ebc0810675 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -2,7 +2,7 @@
/*
* xt_connmark - Netfilter module to operate on connection marks
*
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
* by Henrik Nordstrom <hno@marasystems.com>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
* Jan Engelhardt <jengelh@medozas.de>
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 8c835ad63729..0859b8f76764 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -132,7 +132,7 @@ struct xt_hashlimit_htable {
const char *name;
struct net *net;
- struct hlist_head hash[0]; /* hashtable itself */
+ struct hlist_head hash[]; /* hashtable itself */
};
static int
@@ -1052,7 +1052,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
static void *dl_seq_start(struct seq_file *s, loff_t *pos)
__acquires(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket;
spin_lock_bh(&htable->lock);
@@ -1069,7 +1069,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
*pos = ++(*bucket);
@@ -1083,7 +1083,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void dl_seq_stop(struct seq_file *s, void *v)
__releases(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
if (!IS_ERR(bucket))
@@ -1125,7 +1125,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1140,7 +1140,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1155,7 +1155,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1169,7 +1169,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_show_v2(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
@@ -1183,7 +1183,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
static int dl_seq_show_v1(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
@@ -1197,7 +1197,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
static int dl_seq_show(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bd1dea9c7b88..8b4fd27857f2 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -8,16 +8,14 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_limit.h>
struct xt_limit_priv {
- spinlock_t lock;
unsigned long prev;
- uint32_t credit;
+ u32 credit;
};
MODULE_LICENSE("GPL");
@@ -66,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateinfo *r = par->matchinfo;
struct xt_limit_priv *priv = r->master;
- unsigned long now = jiffies;
-
- spin_lock_bh(&priv->lock);
- priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
- if (priv->credit > r->credit_cap)
- priv->credit = r->credit_cap;
-
- if (priv->credit >= r->cost) {
- /* We're not limited. */
- priv->credit -= r->cost;
- spin_unlock_bh(&priv->lock);
- return true;
- }
-
- spin_unlock_bh(&priv->lock);
- return false;
+ unsigned long now;
+ u32 old_credit, new_credit, credit_increase = 0;
+ bool ret;
+
+ /* fastpath if there is nothing to update */
+ if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies))
+ return false;
+
+ do {
+ now = jiffies;
+ credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
+ old_credit = READ_ONCE(priv->credit);
+ new_credit = old_credit;
+ new_credit += credit_increase;
+ if (new_credit > r->credit_cap)
+ new_credit = r->credit_cap;
+ if (new_credit >= r->cost) {
+ ret = true;
+ new_credit -= r->cost;
+ } else {
+ ret = false;
+ }
+ } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit);
+
+ return ret;
}
/* Precision saver. */
@@ -122,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
r->credit_cap = priv->credit; /* Credits full. */
r->cost = user2credits(r->avg);
}
- spin_lock_init(&priv->lock);
return 0;
}
@@ -134,7 +140,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par)
kfree(info->master);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_xt_rateinfo {
u_int32_t avg;
u_int32_t burst;
@@ -176,7 +182,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src)
};
return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
}
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
static struct xt_match limit_mt_reg __read_mostly = {
.name = "limit",
@@ -186,7 +192,7 @@ static struct xt_match limit_mt_reg __read_mostly = {
.checkentry = limit_mt_check,
.destroy = limit_mt_destroy,
.matchsize = sizeof(struct xt_rateinfo),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(struct compat_xt_rateinfo),
.compat_from_user = limit_mt_compat_from_user,
.compat_to_user = limit_mt_compat_to_user,
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index a8e5f6c8db7a..b4f7bbc3f3ca 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -244,3 +244,4 @@ MODULE_ALIAS("ipt_SNAT");
MODULE_ALIAS("ipt_DNAT");
MODULE_ALIAS("ip6t_SNAT");
MODULE_ALIAS("ip6t_DNAT");
+MODULE_DESCRIPTION("SNAT and DNAT targets support");
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 5aab6df74e0f..7c6bf1c16813 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
- * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ * (C) 2011 Intra2net AG <https://www.intra2net.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -27,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
overquota = nfnl_acct_overquota(xt_net(par), info->nfacct);
- return overquota == NFACCT_UNDERQUOTA ? false : true;
+ return overquota != NFACCT_UNDERQUOTA;
}
static int
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 225a7ab6d79a..7ddb9a78e3fc 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -71,7 +71,7 @@ struct recent_entry {
u_int8_t ttl;
u_int8_t index;
u_int16_t nstamps;
- unsigned long stamps[0];
+ unsigned long stamps[];
};
struct recent_table {
@@ -82,7 +82,7 @@ struct recent_table {
unsigned int entries;
u8 nstamps_max_mask;
struct list_head lru_list;
- struct list_head iphash[0];
+ struct list_head iphash[];
};
struct recent_net {
@@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
/*
* Drop entries with timestamps older then 'time'.
*/
-static void recent_entry_reap(struct recent_table *t, unsigned long time)
+static void recent_entry_reap(struct recent_table *t, unsigned long time,
+ struct recent_entry *working, bool update)
{
struct recent_entry *e;
@@ -162,6 +163,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time)
e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
/*
+ * Do not reap the entry which is going to be updated.
+ */
+ if (e == working && update)
+ return;
+
+ /*
* The last time stamp is the most recent.
*/
if (time_after(time, e->stamps[e->index-1]))
@@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* info->seconds must be non-zero */
if (info->check_set & XT_RECENT_REAP)
- recent_entry_reap(t, time);
+ recent_entry_reap(t, time, e,
+ info->check_set & XT_RECENT_UPDATE && ret);
}
if (info->check_set & XT_RECENT_SET ||
@@ -543,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file)
if (st == NULL)
return -ENOMEM;
- st->table = PDE_DATA(inode);
+ st->table = pde_data(inode);
return 0;
}
@@ -551,7 +559,7 @@ static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
- struct recent_table *t = PDE_DATA(file_inode(file));
+ struct recent_table *t = pde_data(file_inode(file));
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
@@ -640,7 +648,7 @@ static void __net_exit recent_proc_net_exit(struct net *net)
struct recent_table *t;
/* recent_net_exit() is called before recent_mt_destroy(). Make sure
- * that the parent xt_recent proc entry is is empty before trying to
+ * that the parent xt_recent proc entry is empty before trying to
* remove it.
*/
spin_lock_bh(&recent_lock);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 5f973987265d..7013f55f05d1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -216,6 +216,16 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
return 0;
}
+static void socket_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ if (par->family == NFPROTO_IPV4)
+ nf_defrag_ipv4_disable(par->net);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ else if (par->family == NFPROTO_IPV6)
+ nf_defrag_ipv6_disable(par->net);
+#endif
+}
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
@@ -231,6 +241,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.revision = 1,
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
+ .destroy = socket_mt_destroy,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -245,6 +256,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
+ .destroy = socket_mt_destroy,
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
@@ -256,6 +268,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v2_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -268,6 +281,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v2_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -280,6 +294,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v3_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -292,6 +307,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v3_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 203e24ae472c..b26c1dcfc27b 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
- if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
+ if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH:
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 67cb98489415..6aa12d0f54e2 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -5,7 +5,7 @@
* based on ipt_time by Fabrice MARIE <fabrice@netfilter.org>
* This is a module which is used for time matching
* It is using some modified code from dietlibc (localtime() function)
- * that you can find at http://www.fefe.de/dietlibc/
+ * that you can find at https://www.fefe.de/dietlibc/
* This file is distributed under the terms of the GNU General Public
* License (GPL). Copies of the GPL can be obtained from gnu.org/gpl.
*/